From 5bba9a2e242a70d517680bced4237ab13d22e73f Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 6 Jun 2023 16:42:58 +1100 Subject: [PATCH 1/3] bump to 0.1.99 --- recipe/meta.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/recipe/meta.yaml b/recipe/meta.yaml index 50671f7..f4bf214 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.1.97" %} +{% set version = "0.1.99" %} package: name: sentencepiece-split @@ -6,7 +6,7 @@ package: source: url: https://github.com/google/sentencepiece/archive/v{{ version }}.tar.gz - sha256: 41c3a07f315e3ac87605460c8bb8d739955bc8e7f478caec4017ef9b7d78669b + sha256: 63617eaf56c7a3857597dcd8780461f57dd21381b56a27716ef7d7e02e14ced4 patches: # trying to build both static & shared build seems to break on OSX - patches/0001-do-not-mix-static-shared-builds.patch @@ -29,7 +29,7 @@ source: - patches/0009-create-and-install-CMake-metadata.patch build: - number: 3 + number: 0 requirements: build: From 5a5a205c98cfddf373f775c23d26d958dde15640 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 6 Jun 2023 16:47:57 +1100 Subject: [PATCH 2/3] rebase patches --- recipe/meta.yaml | 6 +- ...0001-do-not-mix-static-shared-builds.patch | 18 +- ...ild-vendored-abseil-libprotobuf-lite.patch | 19 +- ...-absolute-paths-for-CMAKE_INSTALL_-D.patch | 15 +- .../0004-ACTUALLY-use-external-absl.patch | 106 +++++----- ...sp-glue-code-belongs-in-third_party-.patch | 184 ++++++++++-------- ...libs-headers-for-windows-in-setup.py.patch | 41 ++-- ...install-pkg-config-files-on-windows.patch} | 11 +- .../0007-don-t-force-replace-MD-with-MT.patch | 41 ---- ...8-create-and-install-CMake-metadata.patch} | 37 ++-- 10 files changed, 222 insertions(+), 256 deletions(-) rename recipe/patches/{0008-also-install-pkg-config-files-on-windows.patch => 0007-also-install-pkg-config-files-on-windows.patch} (72%) delete mode 100644 recipe/patches/0007-don-t-force-replace-MD-with-MT.patch rename recipe/patches/{0009-create-and-install-CMake-metadata.patch => 0008-create-and-install-CMake-metadata.patch} (79%) diff --git a/recipe/meta.yaml b/recipe/meta.yaml index f4bf214..b04c84f 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -21,12 +21,10 @@ source: - patches/0005-stop-pretending-sp-glue-code-belongs-in-third_party-.patch # ensure python bindings link to correct libs on windows - patches/0006-point-to-our-libs-headers-for-windows-in-setup.py.patch - # we don't want to link the msvc runtime statically - - patches/0007-don-t-force-replace-MD-with-MT.patch # install pkg-config metadata also on windows - - patches/0008-also-install-pkg-config-files-on-windows.patch + - patches/0007-also-install-pkg-config-files-on-windows.patch # install CMake metadata - - patches/0009-create-and-install-CMake-metadata.patch + - patches/0008-create-and-install-CMake-metadata.patch build: number: 0 diff --git a/recipe/patches/0001-do-not-mix-static-shared-builds.patch b/recipe/patches/0001-do-not-mix-static-shared-builds.patch index 3014659..c0e4654 100644 --- a/recipe/patches/0001-do-not-mix-static-shared-builds.patch +++ b/recipe/patches/0001-do-not-mix-static-shared-builds.patch @@ -1,17 +1,17 @@ -From 36b084be62cd5f116749cab4cc0de4206555c628 Mon Sep 17 00:00:00 2001 +From df7925b506acf44eb3f06ce94e812c8a23f83699 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 2 Dec 2021 08:39:53 +1100 -Subject: [PATCH 1/9] do not mix static & shared builds +Subject: [PATCH 1/8] do not mix static & shared builds --- src/CMakeLists.txt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt -index 6cb3922..0a7f67e 100644 +index 1c7726e..39b0ef2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt -@@ -202,13 +202,13 @@ endif() +@@ -220,18 +220,18 @@ endif() if (SPM_ENABLE_SHARED) add_library(sentencepiece SHARED ${SPM_SRCS}) add_library(sentencepiece_train SHARED ${SPM_TRAIN_SRCS}) @@ -31,16 +31,13 @@ index 6cb3922..0a7f67e 100644 if (SPM_ENABLE_SHARED) target_link_libraries(sentencepiece ${SPM_LIBS}) -@@ -220,7 +220,7 @@ if (SPM_ENABLE_SHARED) - (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "sh4")) - list(APPEND SPM_LIBS "atomic") - endif() + target_link_libraries(sentencepiece_train ${SPM_LIBS} sentencepiece) - set(SPM_INSTALLTARGETS sentencepiece sentencepiece_train sentencepiece-static sentencepiece_train-static) + set(SPM_INSTALLTARGETS sentencepiece sentencepiece_train) set_target_properties(sentencepiece sentencepiece_train PROPERTIES SOVERSION 0 VERSION 0.0.0) set_target_properties(sentencepiece PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES) set_target_properties(sentencepiece_train PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES) -@@ -235,10 +235,10 @@ else() +@@ -246,10 +246,10 @@ else() add_library(sentencepiece ALIAS sentencepiece-static) add_library(sentencepiece_train ALIAS sentencepiece_train-static) set(SPM_INSTALLTARGETS sentencepiece-static sentencepiece_train-static) @@ -54,6 +51,3 @@ index 6cb3922..0a7f67e 100644 if (NOT MSVC) if (SPM_COVERAGE) --- -2.38.1.windows.1 - diff --git a/recipe/patches/0002-do-not-build-vendored-abseil-libprotobuf-lite.patch b/recipe/patches/0002-do-not-build-vendored-abseil-libprotobuf-lite.patch index 1ee80de..673b37b 100644 --- a/recipe/patches/0002-do-not-build-vendored-abseil-libprotobuf-lite.patch +++ b/recipe/patches/0002-do-not-build-vendored-abseil-libprotobuf-lite.patch @@ -1,7 +1,7 @@ -From 285d03b18e98d013f33106694e963ef8e4c2e7bc Mon Sep 17 00:00:00 2001 +From 075ffeaf1de3319784ba18f2b516bb3daf9f8d16 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 2 Dec 2021 10:05:12 +1100 -Subject: [PATCH 2/9] do not build vendored abseil & libprotobuf-lite +Subject: [PATCH 2/8] do not build vendored abseil & libprotobuf-lite --- CMakeLists.txt | 6 +++--- @@ -11,10 +11,10 @@ Subject: [PATCH 2/9] do not build vendored abseil & libprotobuf-lite 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt -index 78379a3..827d114 100644 +index 1b3af04..7e40f5c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -57,9 +57,9 @@ add_definitions(-D_FREEBSD) +@@ -68,9 +68,9 @@ add_definitions(-D_FREEBSD) endif() if (SPM_USE_BUILTIN_PROTOBUF) @@ -26,7 +26,7 @@ index 78379a3..827d114 100644 endif() if (MSVC) -@@ -110,7 +110,7 @@ if (SPM_BUILD_TEST) +@@ -147,7 +147,7 @@ if (SPM_BUILD_TEST) endif() if (SPM_USE_EXTERNAL_ABSL) @@ -36,10 +36,10 @@ index 78379a3..827d114 100644 add_subdirectory(src) diff --git a/sentencepiece.pc.in b/sentencepiece.pc.in -index ac7fef6..cc9bf78 100644 +index 6a5ba56..1108973 100644 --- a/sentencepiece.pc.in +++ b/sentencepiece.pc.in -@@ -6,5 +6,5 @@ includedir=@includedir@ +@@ -6,5 +6,5 @@ includedir=@includedir_for_pc_file@ Name: @PROJECT_NAME@ Description: Unsupervised text tokenizer and detokenizer for Neural Network-based text generation. Version: @PROJECT_VERSION@ @@ -47,7 +47,7 @@ index ac7fef6..cc9bf78 100644 +Libs: -L${libdir} -lsentencepiece -lsentencepiece_train @libprotobuf@ @pkgconfiglibs@ Cflags: -I${includedir} @pkgconfigcflags@ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt -index 0a7f67e..d4e2ddc 100644 +index 39b0ef2..6dfd76d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -69,6 +69,11 @@ if (SPM_USE_BUILTIN_PROTOBUF) @@ -72,6 +72,3 @@ index d00ecba..3096702 100644 - - +include_directories(darts_clone esaxx) --- -2.38.1.windows.1 - diff --git a/recipe/patches/0003-consistently-use-absolute-paths-for-CMAKE_INSTALL_-D.patch b/recipe/patches/0003-consistently-use-absolute-paths-for-CMAKE_INSTALL_-D.patch index 212560c..66e3de4 100644 --- a/recipe/patches/0003-consistently-use-absolute-paths-for-CMAKE_INSTALL_-D.patch +++ b/recipe/patches/0003-consistently-use-absolute-paths-for-CMAKE_INSTALL_-D.patch @@ -1,17 +1,17 @@ -From d275c55621f9fea17a1f07c885235e495f4eae34 Mon Sep 17 00:00:00 2001 +From bb0bda21c70a52c388353876a64560b5f6243c7b Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 6 Dec 2021 21:18:54 +1100 -Subject: [PATCH 3/9] consistently use absolute paths for CMAKE_INSTALL_*DIR +Subject: [PATCH 3/8] consistently use absolute paths for CMAKE_INSTALL_*DIR --- CMakeLists.txt | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt -index 827d114..f4f7548 100644 +index 7e40f5c..434530a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -40,11 +40,8 @@ endif() +@@ -51,11 +51,8 @@ endif() if (UNIX) include(GNUInstallDirs) @@ -25,7 +25,7 @@ index 827d114..f4f7548 100644 set(prefix ${CMAKE_INSTALL_PREFIX}) set(exec_prefix "\${prefix}") set(libdir "\${exec_prefix}/lib") -@@ -83,15 +80,15 @@ if (APPLE) +@@ -96,15 +93,15 @@ if (APPLE) endif() if (NOT DEFINED CMAKE_INSTALL_BINDIR) @@ -43,7 +43,4 @@ index 827d114..f4f7548 100644 + set(CMAKE_INSTALL_INCDIR "${CMAKE_INSTALL_PREFIX}/include") endif() - configure_file("${PROJECT_SOURCE_DIR}/config.h.in" "config.h") --- -2.38.1.windows.1 - + # SPDX-License-Identifier: (MIT OR CC0-1.0) diff --git a/recipe/patches/0004-ACTUALLY-use-external-absl.patch b/recipe/patches/0004-ACTUALLY-use-external-absl.patch index 12d5b09..3e1784f 100644 --- a/recipe/patches/0004-ACTUALLY-use-external-absl.patch +++ b/recipe/patches/0004-ACTUALLY-use-external-absl.patch @@ -1,7 +1,7 @@ -From d2ae2133a8e718877e9c69002485879ba5847bb9 Mon Sep 17 00:00:00 2001 +From 25b3524ec1fd000bab823d4e67cb25179f71aedd Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 21 Feb 2022 10:05:39 +1100 -Subject: [PATCH 4/9] ACTUALLY use external absl +Subject: [PATCH 4/8] ACTUALLY use external absl make include order consistent: - own header @@ -11,7 +11,7 @@ make include order consistent: --- src/CMakeLists.txt | 6 ------ src/bpe_model.cc | 3 ++- - src/bpe_model_trainer.cc | 3 ++- + src/bpe_model_trainer.cc | 7 ++++--- src/bpe_model_trainer.h | 3 ++- src/bpe_model_trainer_test.cc | 5 +++-- src/builder.cc | 9 +++++---- @@ -31,9 +31,9 @@ make include order consistent: src/model_interface_test.cc | 3 ++- src/normalizer.cc | 9 +++++---- src/normalizer.h | 3 ++- - src/pretokenizer_for_training.cc | 4 +++- + src/pretokenizer_for_training.cc | 3 ++- src/pretokenizer_for_training.h | 3 ++- - src/pretokenizer_for_training_test.cc | 4 +++- + src/pretokenizer_for_training_test.cc | 8 +++++--- src/sentencepiece_processor.cc | 17 +++++++++-------- src/sentencepiece_processor.h | 9 ++------- src/sentencepiece_processor_test.cc | 9 +++++---- @@ -56,20 +56,20 @@ make include order consistent: src/unicode_script_test.cc | 3 ++- src/unigram_model.cc | 9 +++++---- src/unigram_model_test.cc | 5 +++-- - src/unigram_model_trainer.cc | 5 +++-- + src/unigram_model_trainer.cc | 9 +++++---- src/unigram_model_trainer.h | 3 ++- src/unigram_model_trainer_test.cc | 5 +++-- src/util.h | 3 ++- src/util_test.cc | 3 ++- src/word_model_trainer.cc | 5 +++-- src/word_model_trainer_test.cc | 5 +++-- - 54 files changed, 163 insertions(+), 120 deletions(-) + 54 files changed, 168 insertions(+), 126 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt -index d4e2ddc..944f726 100644 +index 6dfd76d..d30a4c2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt -@@ -84,12 +84,6 @@ endif() +@@ -89,12 +89,6 @@ endif() include_directories(${CMAKE_CURRENT_BINARY_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../third_party) @@ -98,17 +98,21 @@ index bc7ada1..6cb5ab4 100644 namespace sentencepiece { diff --git a/src/bpe_model_trainer.cc b/src/bpe_model_trainer.cc -index 964d44e..d3b9d50 100644 +index de86f14..42d737a 100644 --- a/src/bpe_model_trainer.cc +++ b/src/bpe_model_trainer.cc -@@ -17,8 +17,9 @@ +@@ -19,10 +19,11 @@ #include #include +#include "absl/container/flat_hash_set.h" ++#include "absl/strings/str_join.h" ++#include "absl/strings/str_replace.h" + - #include "bpe_model_trainer.h" + #include "pretokenizer_for_training.h" -#include "third_party/absl/container/flat_hash_set.h" +-#include "third_party/absl/strings/str_join.h" +-#include "third_party/absl/strings/str_replace.h" #include "util.h" namespace sentencepiece { @@ -227,7 +231,7 @@ index 8c2e4b7..b49cea1 100644 namespace sentencepiece { diff --git a/src/common.h b/src/common.h -index ba951d6..b5f2ae8 100644 +index ef5546d..5245004 100644 --- a/src/common.h +++ b/src/common.h @@ -25,8 +25,9 @@ @@ -242,7 +246,7 @@ index ba951d6..b5f2ae8 100644 #if defined(_WIN32) && !defined(__CYGWIN__) #define OS_WIN diff --git a/src/compile_charsmap_main.cc b/src/compile_charsmap_main.cc -index 13bf822..48da74a 100644 +index da15328..23e5ef8 100644 --- a/src/compile_charsmap_main.cc +++ b/src/compile_charsmap_main.cc @@ -18,12 +18,13 @@ @@ -262,10 +266,10 @@ index 13bf822..48da74a 100644 using sentencepiece::normalizer::Builder; diff --git a/src/error.cc b/src/error.cc -index 10faa2d..1cc77b8 100644 +index d3792dc..19ef6f3 100644 --- a/src/error.cc +++ b/src/error.cc -@@ -20,8 +20,8 @@ +@@ -21,8 +21,8 @@ #ifdef _USE_EXTERNAL_ABSL // Naive workaround to define minloglevel on external absl package. // We want to define them in other cc file. @@ -324,7 +328,7 @@ index 790e756..3c9fbdc 100644 namespace sentencepiece { diff --git a/src/init.h b/src/init.h -index 090a2d9..8a557f2 100644 +index 6ae047e..1f4c292 100644 --- a/src/init.h +++ b/src/init.h @@ -15,9 +15,10 @@ @@ -338,8 +342,8 @@ index 090a2d9..8a557f2 100644 -#include "third_party/absl/flags/flag.h" -#include "third_party/absl/flags/parse.h" - ABSL_DECLARE_FLAG(int32, minloglevel); - + #ifdef _USE_EXTERNAL_PROTOBUF + #include "google/protobuf/message_lite.h" diff --git a/src/model_factory.cc b/src/model_factory.cc index be99501..a7ccf0f 100644 --- a/src/model_factory.cc @@ -450,25 +454,25 @@ index c79813c..50b5108 100644 namespace sentencepiece { diff --git a/src/pretokenizer_for_training.cc b/src/pretokenizer_for_training.cc -index 049658e..2e9137d 100644 +index d4f492c..03df377 100644 --- a/src/pretokenizer_for_training.cc +++ b/src/pretokenizer_for_training.cc -@@ -11,10 +11,12 @@ +@@ -11,11 +11,12 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.! + + #include "pretokenizer_for_training.h" + #include -+#include "absl/strings/str_replace.h" -+ - #include "pretokenizer_for_training.h" -#include "third_party/absl/strings/str_replace.h" ++#include "absl/strings/str_replace.h" namespace sentencepiece { namespace pretokenizer { diff --git a/src/pretokenizer_for_training.h b/src/pretokenizer_for_training.h -index 2d3bc82..d03285e 100644 +index fa54f95..1ec5dfd 100644 --- a/src/pretokenizer_for_training.h +++ b/src/pretokenizer_for_training.h @@ -18,10 +18,11 @@ @@ -485,19 +489,24 @@ index 2d3bc82..d03285e 100644 namespace sentencepiece { namespace pretokenizer { diff --git a/src/pretokenizer_for_training_test.cc b/src/pretokenizer_for_training_test.cc -index 80f4787..5c74645 100644 +index 99db0c5..a968b57 100644 --- a/src/pretokenizer_for_training_test.cc +++ b/src/pretokenizer_for_training_test.cc -@@ -11,9 +11,11 @@ +@@ -11,12 +11,14 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.! + + #include "pretokenizer_for_training.h" + +#include "absl/strings/str_cat.h" ++#include "absl/strings/str_join.h" ++#include "absl/strings/str_split.h" + - #include "pretokenizer_for_training.h" #include "testharness.h" -#include "third_party/absl/strings/str_cat.h" +-#include "third_party/absl/strings/str_join.h" +-#include "third_party/absl/strings/str_split.h" #include "trainer_interface.h" namespace sentencepiece { @@ -638,7 +647,7 @@ index e44e66b..0bb5aab 100644 namespace sentencepiece { diff --git a/src/spec_parser.h b/src/spec_parser.h -index de8f72f..528c0eb 100644 +index c5f0582..3871621 100644 --- a/src/spec_parser.h +++ b/src/spec_parser.h @@ -18,9 +18,10 @@ @@ -655,7 +664,7 @@ index de8f72f..528c0eb 100644 namespace sentencepiece { diff --git a/src/spm_decode_main.cc b/src/spm_decode_main.cc -index 3382ddc..f3b8269 100644 +index bc49bd3..32fbb76 100644 --- a/src/spm_decode_main.cc +++ b/src/spm_decode_main.cc @@ -16,13 +16,14 @@ @@ -676,7 +685,7 @@ index 3382ddc..f3b8269 100644 ABSL_FLAG(std::string, model, "", "model file name"); diff --git a/src/spm_encode_main.cc b/src/spm_encode_main.cc -index b0e508d..2199aa4 100644 +index 2fbb850..02def40 100644 --- a/src/spm_encode_main.cc +++ b/src/spm_encode_main.cc @@ -16,15 +16,16 @@ @@ -701,10 +710,10 @@ index b0e508d..2199aa4 100644 ABSL_FLAG(std::string, model, "", "model file name"); diff --git a/src/spm_export_vocab_main.cc b/src/spm_export_vocab_main.cc -index b5d93cb..1603195 100644 +index e5b97df..d0aea7d 100644 --- a/src/spm_export_vocab_main.cc +++ b/src/spm_export_vocab_main.cc -@@ -15,12 +15,13 @@ +@@ -14,12 +14,13 @@ #include @@ -720,7 +729,7 @@ index b5d93cb..1603195 100644 ABSL_FLAG(std::string, output, "", "Output filename"); ABSL_FLAG(std::string, model, "", "input model file name"); diff --git a/src/spm_normalize_main.cc b/src/spm_normalize_main.cc -index 96da360..dca66b1 100644 +index 39f3ef9..7352f63 100644 --- a/src/spm_normalize_main.cc +++ b/src/spm_normalize_main.cc @@ -12,6 +12,8 @@ @@ -741,7 +750,7 @@ index 96da360..dca66b1 100644 ABSL_FLAG(std::string, model, "", "Model file name"); ABSL_FLAG(bool, use_internal_normalization, false, diff --git a/src/spm_train_main.cc b/src/spm_train_main.cc -index c34ee02..84131cf 100644 +index 34369cd..08c519f 100644 --- a/src/spm_train_main.cc +++ b/src/spm_train_main.cc @@ -14,14 +14,15 @@ @@ -815,7 +824,7 @@ index d1d2541..34abe3f 100644 #include "unigram_model_trainer.h" #include "word_model_trainer.h" diff --git a/src/trainer_interface.cc b/src/trainer_interface.cc -index 7270f29..8d30703 100644 +index 968f7b9..27cec4f 100644 --- a/src/trainer_interface.cc +++ b/src/trainer_interface.cc @@ -22,21 +22,22 @@ @@ -870,10 +879,10 @@ index 8d625a9..58425c5 100644 namespace sentencepiece { diff --git a/src/trainer_interface_test.cc b/src/trainer_interface_test.cc -index d6c0c78..6fea564 100644 +index feb970f..75e9f54 100644 --- a/src/trainer_interface_test.cc +++ b/src/trainer_interface_test.cc -@@ -14,10 +14,11 @@ +@@ -16,10 +16,11 @@ #include @@ -884,9 +893,9 @@ index d6c0c78..6fea564 100644 #include "testharness.h" -#include "third_party/absl/strings/str_cat.h" -#include "third_party/absl/strings/str_format.h" - #include "trainer_interface.h" #include "util.h" + namespace sentencepiece { diff --git a/src/unicode_script.cc b/src/unicode_script.cc index 583dc30..5790566 100644 --- a/src/unicode_script.cc @@ -953,7 +962,7 @@ index d9f1ce9..ee085ec 100644 namespace sentencepiece { diff --git a/src/unigram_model_test.cc b/src/unigram_model_test.cc -index 221bac2..275563e 100644 +index bf22da3..567166a 100644 --- a/src/unigram_model_test.cc +++ b/src/unigram_model_test.cc @@ -19,11 +19,12 @@ @@ -972,26 +981,30 @@ index 221bac2..275563e 100644 namespace sentencepiece { diff --git a/src/unigram_model_trainer.cc b/src/unigram_model_trainer.cc -index ee7921c..5fd1962 100644 +index d58c408..baed626 100644 --- a/src/unigram_model_trainer.cc +++ b/src/unigram_model_trainer.cc -@@ -24,11 +24,12 @@ +@@ -24,13 +24,14 @@ #include #include +#include "absl/container/flat_hash_map.h" +#include "absl/memory/memory.h" ++#include "absl/strings/str_replace.h" ++#include "absl/strings/str_split.h" + #include "normalizer.h" #include "pretokenizer_for_training.h" #include "sentencepiece_trainer.h" -#include "third_party/absl/container/flat_hash_map.h" -#include "third_party/absl/memory/memory.h" +-#include "third_party/absl/strings/str_replace.h" +-#include "third_party/absl/strings/str_split.h" #include "third_party/esaxx/esa.hxx" // Suffix array library. #include "unicode_script.h" #include "util.h" diff --git a/src/unigram_model_trainer.h b/src/unigram_model_trainer.h -index 9593e31..871749d 100644 +index c6562e6..0a3f640 100644 --- a/src/unigram_model_trainer.h +++ b/src/unigram_model_trainer.h @@ -20,8 +20,9 @@ @@ -1006,7 +1019,7 @@ index 9593e31..871749d 100644 #include "unigram_model.h" #include "util.h" diff --git a/src/unigram_model_trainer_test.cc b/src/unigram_model_trainer_test.cc -index 059772b..4b336ec 100644 +index 9d2c526..ab887b6 100644 --- a/src/unigram_model_trainer_test.cc +++ b/src/unigram_model_trainer_test.cc @@ -17,13 +17,14 @@ @@ -1027,7 +1040,7 @@ index 059772b..4b336ec 100644 namespace sentencepiece { diff --git a/src/util.h b/src/util.h -index 01a561f..f7f2194 100644 +index 5110291..ece08e9 100644 --- a/src/util.h +++ b/src/util.h @@ -28,9 +28,10 @@ @@ -1093,6 +1106,3 @@ index c4a8bc6..8288027 100644 #include "util.h" #include "word_model_trainer.h" --- -2.38.1.windows.1 - diff --git a/recipe/patches/0005-stop-pretending-sp-glue-code-belongs-in-third_party-.patch b/recipe/patches/0005-stop-pretending-sp-glue-code-belongs-in-third_party-.patch index adf84dc..77b407f 100644 --- a/recipe/patches/0005-stop-pretending-sp-glue-code-belongs-in-third_party-.patch +++ b/recipe/patches/0005-stop-pretending-sp-glue-code-belongs-in-third_party-.patch @@ -1,11 +1,10 @@ -From 2d7b8838f812fdc2ebd719d6e0174aaf301c56fe Mon Sep 17 00:00:00 2001 +From 60017d890e3f1211cea3b00120e2498f4755d01d Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 21 Feb 2022 12:50:47 +1100 -Subject: [PATCH 5/9] stop pretending sp glue code belongs in third_party/absl +Subject: [PATCH 5/8] stop pretending sp glue code belongs in third_party/absl --- src/CMakeLists.txt | 4 +- - src/bpe_model_trainer.cc | 4 +- src/bpe_model_trainer_test.cc | 11 +- src/builder_test.cc | 15 +- src/char_model_trainer_test.cc | 5 +- @@ -13,11 +12,11 @@ Subject: [PATCH 5/9] stop pretending sp glue code belongs in third_party/absl src/error.cc | 9 -- src/filesystem_test.cc | 5 +- {third_party/absl => src/glue}/flags/flag.cc | 22 +-- - {third_party/absl => src/glue}/flags/flag.h | 16 +-- + {third_party/absl => src/glue}/flags/flag.h | 18 +-- {third_party/absl => src/glue}/flags/parse.h | 10 +- .../absl => src/glue}/random/distributions.h | 4 +- .../absl => src/glue}/random/random.h | 6 +- - src/init.h | 11 +- + src/init.h | 15 +- src/init_test.cc | 61 ++++---- src/sentencepiece_processor_test.cc | 9 +- src/sentencepiece_trainer_test.cc | 27 ++-- @@ -25,24 +24,24 @@ Subject: [PATCH 5/9] stop pretending sp glue code belongs in third_party/absl src/spm_encode_main.cc | 74 +++++----- src/spm_export_vocab_main.cc | 19 ++- src/spm_normalize_main.cc | 45 +++--- - src/spm_train_main.cc | 136 +++++++++--------- + src/spm_train_main.cc | 138 +++++++++--------- src/test_main.cc | 7 +- src/testharness.cc | 5 +- src/testharness.h | 8 +- src/trainer_interface.cc | 10 +- src/trainer_interface_test.cc | 7 +- - src/unigram_model_trainer_test.cc | 7 +- + src/unigram_model_trainer_test.cc | 11 +- src/util_test.cc | 5 +- src/word_model_trainer_test.cc | 5 +- - 30 files changed, 301 insertions(+), 295 deletions(-) - rename {third_party/absl => src/glue}/flags/flag.cc (93%) - rename {third_party/absl => src/glue}/flags/flag.h (78%) + 29 files changed, 304 insertions(+), 300 deletions(-) + rename {third_party/absl => src/glue}/flags/flag.cc (94%) + rename {third_party/absl => src/glue}/flags/flag.h (77%) rename {third_party/absl => src/glue}/flags/parse.h (79%) rename {third_party/absl => src/glue}/random/distributions.h (94%) rename {third_party/absl => src/glue}/random/random.h (91%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt -index 944f726..6d986b6 100644 +index d30a4c2..c130e8b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,7 +13,9 @@ @@ -56,28 +55,6 @@ index 944f726..6d986b6 100644 set(ABSL_STRINGS_SRCS "") list(APPEND SPM_LIBS absl::strings) list(APPEND SPM_LIBS absl::flags) -diff --git a/src/bpe_model_trainer.cc b/src/bpe_model_trainer.cc -index d3b9d50..b0073df 100644 ---- a/src/bpe_model_trainer.cc -+++ b/src/bpe_model_trainer.cc -@@ -12,6 +12,8 @@ - // See the License for the specific language governing permissions and - // limitations under the License.! - -+#include "bpe_model_trainer.h" -+ - #include - #include - #include -@@ -19,7 +21,7 @@ - - #include "absl/container/flat_hash_set.h" - --#include "bpe_model_trainer.h" -+#include "glue/flags/flag.h" - #include "util.h" - - namespace sentencepiece { diff --git a/src/bpe_model_trainer_test.cc b/src/bpe_model_trainer_test.cc index f977fd0..67b5b2d 100644 --- a/src/bpe_model_trainer_test.cc @@ -225,7 +202,7 @@ index b49cea1..7192b68 100644 auto output = filesystem::NewWritableFile(input_file); for (const auto &line : input) { diff --git a/src/compile_charsmap_main.cc b/src/compile_charsmap_main.cc -index 48da74a..a05c069 100644 +index 23e5ef8..88c217a 100644 --- a/src/compile_charsmap_main.cc +++ b/src/compile_charsmap_main.cc @@ -18,17 +18,17 @@ @@ -248,7 +225,7 @@ index 48da74a..a05c069 100644 "make normalization_rule.h file"); namespace sentencepiece { -@@ -186,7 +186,7 @@ int main(int argc, char **argv) { +@@ -187,7 +187,7 @@ int main(int argc, char **argv) { data.emplace_back(p.first, index); } @@ -258,11 +235,11 @@ index 48da74a..a05c069 100644 auto output = sentencepiece::filesystem::NewWritableFile(kPrecompiledHeaderFileName); diff --git a/src/error.cc b/src/error.cc -index 1cc77b8..f12c739 100644 +index 19ef6f3..e30f914 100644 --- a/src/error.cc +++ b/src/error.cc -@@ -17,15 +17,6 @@ - #include "common.h" +@@ -18,15 +18,6 @@ + #include "init.h" #include "sentencepiece_processor.h" -#ifdef _USE_EXTERNAL_ABSL @@ -308,10 +285,10 @@ index 3c9fbdc..305cd87 100644 for (size_t i = 0; i < kData.size(); ++i) { EXPECT_TRUE(input->ReadLine(&line)); diff --git a/third_party/absl/flags/flag.cc b/src/glue/flags/flag.cc -similarity index 93% +similarity index 94% rename from third_party/absl/flags/flag.cc rename to src/glue/flags/flag.cc -index 8e99c0d..ceaa595 100644 +index 5d6642a..06aeb7e 100644 --- a/third_party/absl/flags/flag.cc +++ b/src/glue/flags/flag.cc @@ -12,25 +12,25 @@ @@ -348,7 +325,7 @@ index 8e99c0d..ceaa595 100644 namespace internal { namespace { template -@@ -209,14 +209,14 @@ std::vector ParseCommandLine(int argc, char *argv[]) { +@@ -209,10 +209,10 @@ std::vector ParseCommandLine(int argc, char *argv[]) { } } @@ -361,16 +338,17 @@ index 8e99c0d..ceaa595 100644 std::cout << PACKAGE_STRING << " " << VERSION << std::endl; sentencepiece::error::Exit(0); } - - return output_args; +@@ -229,4 +229,4 @@ void CleanupFlags() { + } } + -} // namespace absl +} // namespace sentencepiece diff --git a/third_party/absl/flags/flag.h b/src/glue/flags/flag.h -similarity index 78% +similarity index 77% rename from third_party/absl/flags/flag.h rename to src/glue/flags/flag.h -index e540edf..c8a7dab 100644 +index c522358..ba439fe 100644 --- a/third_party/absl/flags/flag.h +++ b/src/glue/flags/flag.h @@ -12,15 +12,15 @@ @@ -392,10 +370,15 @@ index e540edf..c8a7dab 100644 namespace internal { struct FlagFunc; -@@ -52,11 +52,11 @@ void SetFlag(Flag *flag, const V &v) { - const T value(v); +@@ -54,15 +54,15 @@ void SetFlag(Flag *flag, const V &v) { flag->set_value(value); } + +-#define HAS_ABSL_CLEANUP_FLAGS ++#define HAS_STPC_CLEANUP_FLAGS + + void CleanupFlags(); + -} // namespace absl +} // namespace sentencepiece @@ -488,10 +471,10 @@ index 3c3a21e..ac20d33 100644 #endif // ABSL_CONTAINER_RANDOM_H_ diff --git a/src/init.h b/src/init.h -index 8a557f2..d9c40e0 100644 +index 1f4c292..ec4e4be 100644 --- a/src/init.h +++ b/src/init.h -@@ -15,17 +15,16 @@ +@@ -15,10 +15,9 @@ #ifndef INIT_H_ #define INIT_H_ @@ -502,6 +485,12 @@ index 8a557f2..d9c40e0 100644 +#include "glue/flags/flag.h" +#include "glue/flags/parse.h" + #ifdef _USE_EXTERNAL_PROTOBUF + #include "google/protobuf/message_lite.h" +@@ -26,12 +25,12 @@ + #include "third_party/protobuf-lite/google/protobuf/message_lite.h" + #endif + -ABSL_DECLARE_FLAG(int32, minloglevel); +STPC_DECLARE_FLAG(int32, minloglevel); @@ -513,14 +502,22 @@ index 8a557f2..d9c40e0 100644 if (remove_arg) { char **argv_val = *argv; -@@ -34,7 +33,7 @@ inline void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, +@@ -40,13 +39,13 @@ inline void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, *argc = static_cast(unused_args.size()); } - logging::SetMinLogLevel(absl::GetFlag(FLAGS_minloglevel)); + logging::SetMinLogLevel(sentencepiece::GetFlag(FLAGS_minloglevel)); } - } // namespace sentencepiece + + inline void ShutdownLibrary() { + google::protobuf::ShutdownProtobufLibrary(); +-#ifdef HAS_ABSL_CLEANUP_FLAGS +- absl::CleanupFlags(); ++#ifdef HAS_STPC_CLEANUP_FLAGS ++ sentencepiece::CleanupFlags(); + #endif + } diff --git a/src/init_test.cc b/src/init_test.cc index e5cd2e4..82381c2 100644 @@ -774,10 +771,10 @@ index 0bb5aab..1a241b0 100644 NormalizerSpec normalizer_spec; ASSERT_TRUE(SentencePieceTrainer::Train(trainer_spec, normalizer_spec).ok()); diff --git a/src/spm_decode_main.cc b/src/spm_decode_main.cc -index f3b8269..e66f7f4 100644 +index 32fbb76..a045823 100644 --- a/src/spm_decode_main.cc +++ b/src/spm_decode_main.cc -@@ -21,42 +21,43 @@ +@@ -21,17 +21,18 @@ #include "common.h" #include "filesystem.h" @@ -802,6 +799,7 @@ index f3b8269..e66f7f4 100644 "':' separated encoder extra options, e.g., \"reverse:bos:eos\""); int main(int argc, char *argv[]) { +@@ -39,25 +40,25 @@ int main(int argc, char *argv[]) { sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); std::vector rest_args; @@ -833,7 +831,7 @@ index f3b8269..e66f7f4 100644 CHECK_OK(output->status()); std::string detok, line; -@@ -72,36 +73,36 @@ int main(int argc, char *argv[]) { +@@ -73,36 +74,36 @@ int main(int argc, char *argv[]) { return ids; }; @@ -880,10 +878,10 @@ index f3b8269..e66f7f4 100644 for (const auto &filename : rest_args) { diff --git a/src/spm_encode_main.cc b/src/spm_encode_main.cc -index 2199aa4..4a4d19c 100644 +index 02def40..b22cc7a 100644 --- a/src/spm_encode_main.cc +++ b/src/spm_encode_main.cc -@@ -17,72 +17,72 @@ +@@ -17,38 +17,38 @@ #include #include "absl/container/flat_hash_map.h" @@ -934,6 +932,7 @@ index 2199aa4..4a4d19c 100644 "Generates vocabulary file instead of segmentation"); int main(int argc, char *argv[]) { +@@ -56,34 +56,34 @@ int main(int argc, char *argv[]) { sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); std::vector rest_args; @@ -979,7 +978,7 @@ index 2199aa4..4a4d19c 100644 CHECK_OK(output->status()); std::string line; -@@ -95,10 +95,10 @@ int main(int argc, char *argv[]) { +@@ -96,10 +96,10 @@ int main(int argc, char *argv[]) { sentencepiece::NBestSentencePieceText nbest_spt; std::function process; @@ -993,7 +992,7 @@ index 2199aa4..4a4d19c 100644 process = [&](absl::string_view line) { CHECK_OK(sp.Encode(line, &spt)); for (const auto &piece : spt.pieces()) { -@@ -106,53 +106,53 @@ int main(int argc, char *argv[]) { +@@ -107,53 +107,53 @@ int main(int argc, char *argv[]) { vocab[piece.piece()]++; } }; @@ -1057,7 +1056,7 @@ index 2199aa4..4a4d19c 100644 } for (const auto &filename : rest_args) { -@@ -163,7 +163,7 @@ int main(int argc, char *argv[]) { +@@ -164,7 +164,7 @@ int main(int argc, char *argv[]) { } } @@ -1067,10 +1066,10 @@ index 2199aa4..4a4d19c 100644 output->WriteLine(it.first + "\t" + sentencepiece::string_util::SimpleItoa(it.second)); diff --git a/src/spm_export_vocab_main.cc b/src/spm_export_vocab_main.cc -index 1603195..585c143 100644 +index d0aea7d..c161662 100644 --- a/src/spm_export_vocab_main.cc +++ b/src/spm_export_vocab_main.cc -@@ -15,17 +15,16 @@ +@@ -14,17 +14,16 @@ #include @@ -1126,7 +1125,7 @@ index 1603195..585c143 100644 return 0; diff --git a/src/spm_normalize_main.cc b/src/spm_normalize_main.cc -index dca66b1..a61e7d4 100644 +index 7352f63..f6fe88a 100644 --- a/src/spm_normalize_main.cc +++ b/src/spm_normalize_main.cc @@ -12,11 +12,10 @@ @@ -1171,7 +1170,7 @@ index dca66b1..a61e7d4 100644 using sentencepiece::ModelProto; using sentencepiece::NormalizerSpec; -@@ -50,27 +49,27 @@ int main(int argc, char *argv[]) { +@@ -51,27 +50,27 @@ int main(int argc, char *argv[]) { sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); std::vector rest_args; @@ -1207,7 +1206,7 @@ index dca66b1..a61e7d4 100644 CHECK_OK(SentencePieceTrainer::PopulateNormalizerSpec(&spec)); } else { LOG(FATAL) << "Sets --model, normalization_rule_tsv, or " -@@ -78,22 +77,22 @@ int main(int argc, char *argv[]) { +@@ -79,22 +78,22 @@ int main(int argc, char *argv[]) { } // Uses the normalizer spec encoded in the model_pb. @@ -1236,7 +1235,7 @@ index dca66b1..a61e7d4 100644 if (rest_args.empty()) { diff --git a/src/spm_train_main.cc b/src/spm_train_main.cc -index 84131cf..742e8bd 100644 +index 08c519f..1e6d8a9 100644 --- a/src/spm_train_main.cc +++ b/src/spm_train_main.cc @@ -14,12 +14,12 @@ @@ -1253,7 +1252,7 @@ index 84131cf..742e8bd 100644 #include "init.h" #include "sentencepiece_model.pb.h" #include "sentencepiece_trainer.h" -@@ -33,127 +33,127 @@ static sentencepiece::TrainerSpec kDefaultTrainerSpec; +@@ -33,130 +33,130 @@ static sentencepiece::TrainerSpec kDefaultTrainerSpec; static sentencepiece::NormalizerSpec kDefaultNormalizerSpec; } // namespace @@ -1322,6 +1321,10 @@ index 84131cf..742e8bd 100644 -ABSL_FLAG(bool, split_digits, kDefaultTrainerSpec.split_digits(), +STPC_FLAG(bool, split_digits, kDefaultTrainerSpec.split_digits(), "split all digits (0-9) into separate pieces"); +-ABSL_FLAG(std::string, pretokenization_delimiter, ++STPC_FLAG(std::string, pretokenization_delimiter, + kDefaultTrainerSpec.pretokenization_delimiter(), + "specifies the delimiter of pre-tokenization"); -ABSL_FLAG(bool, treat_whitespace_as_suffix, +STPC_FLAG(bool, treat_whitespace_as_suffix, kDefaultTrainerSpec.treat_whitespace_as_suffix(), @@ -1432,7 +1435,7 @@ index 84131cf..742e8bd 100644 "Threshold for" " clipping the counts for DP"); -@@ -164,11 +164,11 @@ int main(int argc, char *argv[]) { +@@ -168,11 +168,11 @@ int main(int argc, char *argv[]) { sentencepiece::NormalizerSpec normalizer_spec; NormalizerSpec denormalizer_spec; @@ -1448,7 +1451,7 @@ index 84131cf..742e8bd 100644 } auto load_lines = [](absl::string_view filename) { -@@ -182,28 +182,28 @@ int main(int argc, char *argv[]) { +@@ -186,28 +186,28 @@ int main(int argc, char *argv[]) { // Populates the value from flags to spec. #define SetTrainerSpecFromFlag(name) \ @@ -1485,7 +1488,7 @@ index 84131cf..742e8bd 100644 trainer_spec.add_##name(v); \ } \ } -@@ -256,21 +256,21 @@ int main(int argc, char *argv[]) { +@@ -261,21 +261,21 @@ int main(int argc, char *argv[]) { SetRepeatedTrainerSpecFromFile(control_symbols); SetRepeatedTrainerSpecFromFile(user_defined_symbols); @@ -1512,7 +1515,7 @@ index 84131cf..742e8bd 100644 CHECK_OK(sentencepiece::SentencePieceTrainer::Train( trainer_spec, normalizer_spec, denormalizer_spec)); diff --git a/src/test_main.cc b/src/test_main.cc -index b3170e2..cc41c6e 100644 +index 38c978d..f9cf694 100644 --- a/src/test_main.cc +++ b/src/test_main.cc @@ -12,16 +12,17 @@ @@ -1535,7 +1538,7 @@ index b3170e2..cc41c6e 100644 +STPC_FLAG(std::string, test_tmpdir, "test_tmp", "Temporary directory."); int main(int argc, char **argv) { - sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); + sentencepiece::ScopedResourceDestructor cleaner; diff --git a/src/testharness.cc b/src/testharness.cc index 6769cd8..d9a84d2 100644 --- a/src/testharness.cc @@ -1584,7 +1587,7 @@ index 8de9c5c..022325d 100644 namespace sentencepiece { namespace test { diff --git a/src/trainer_interface.cc b/src/trainer_interface.cc -index 8d30703..1e3443f 100644 +index 27cec4f..e9e1cd2 100644 --- a/src/trainer_interface.cc +++ b/src/trainer_interface.cc @@ -24,8 +24,6 @@ @@ -1605,7 +1608,7 @@ index 8d30703..1e3443f 100644 #include "model_factory.h" #include "model_interface.h" #include "normalizer.h" -@@ -303,10 +303,10 @@ bool TrainerInterface::IsValidSentencePiece( +@@ -304,10 +304,10 @@ bool TrainerInterface::IsValidSentencePiece( } template @@ -1618,7 +1621,7 @@ index 8d30703..1e3443f 100644 generator, 0, trainer_spec.differential_privacy_noise_level()); *to_update = -@@ -480,7 +480,7 @@ END: +@@ -481,7 +481,7 @@ END: for (int n = 0; n < num_workers; ++n) { pool->Schedule([&, n]() { // One per thread generator. @@ -1628,18 +1631,18 @@ index 8d30703..1e3443f 100644 AddDPNoise(trainer_spec_, generator, &(sentences_[i].second)); diff --git a/src/trainer_interface_test.cc b/src/trainer_interface_test.cc -index 6fea564..e3e452a 100644 +index 75e9f54..ca4d4e7 100644 --- a/src/trainer_interface_test.cc +++ b/src/trainer_interface_test.cc -@@ -18,6 +18,7 @@ +@@ -20,6 +20,7 @@ #include "absl/strings/str_format.h" #include "filesystem.h" +#include "glue/flags/flag.h" #include "testharness.h" - #include "trainer_interface.h" #include "util.h" -@@ -473,7 +474,7 @@ TEST(TrainerInterfaceTest, SerializeTest) { + +@@ -494,7 +495,7 @@ TEST(TrainerInterfaceTest, SerializeTest) { TEST(TrainerInterfaceTest, CharactersTest) { const std::string input_file = @@ -1648,7 +1651,7 @@ index 6fea564..e3e452a 100644 { auto output = filesystem::NewWritableFile(input_file); // Make a single line with 50 "a", 49 "あ", and 1 "b". -@@ -539,7 +540,7 @@ TEST(TrainerInterfaceTest, MultiFileSentenceIteratorTest) { +@@ -560,7 +561,7 @@ TEST(TrainerInterfaceTest, MultiFileSentenceIteratorTest) { std::vector files; std::vector expected; for (int i = 0; i < 10; ++i) { @@ -1657,7 +1660,7 @@ index 6fea564..e3e452a 100644 absl::StrCat("input", i)); auto output = filesystem::NewWritableFile(file); int num_line = (rand() % 100) + 1; -@@ -561,7 +562,7 @@ TEST(TrainerInterfaceTest, MultiFileSentenceIteratorTest) { +@@ -582,7 +583,7 @@ TEST(TrainerInterfaceTest, MultiFileSentenceIteratorTest) { TEST(TrainerInterfaceTest, MultiFileSentenceIteratorErrorTest) { std::vector files; for (int i = 0; i < 10; ++i) { @@ -1667,7 +1670,7 @@ index 6fea564..e3e452a 100644 files.push_back(file); } diff --git a/src/unigram_model_trainer_test.cc b/src/unigram_model_trainer_test.cc -index 4b336ec..9b8ddb7 100644 +index ab887b6..f1c465b 100644 --- a/src/unigram_model_trainer_test.cc +++ b/src/unigram_model_trainer_test.cc @@ -21,6 +21,7 @@ @@ -1678,7 +1681,19 @@ index 4b336ec..9b8ddb7 100644 #include "sentencepiece_model.pb.h" #include "sentencepiece_processor.h" #include "sentencepiece_trainer.h" -@@ -154,13 +155,13 @@ static constexpr char kTestInputData[] = "wagahaiwa_nekodearu.txt"; +@@ -49,9 +50,9 @@ TrainerResult RunTrainer(const std::vector& input, int size, + const bool use_dp = false, const float dp_noise = 0.0, + const uint32 dp_clip = 0) { + const std::string input_file = +- util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "input"); ++ util::JoinPath(sentencepiece::GetFlag(FLAGS_test_tmpdir), "input"); + const std::string model_prefix = +- util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "model"); ++ util::JoinPath(sentencepiece::GetFlag(FLAGS_test_tmpdir), "model"); + { + auto output = filesystem::NewWritableFile(input_file); + for (const auto& line : input) { +@@ -156,13 +157,13 @@ static constexpr char kTestInputData[] = "wagahaiwa_nekodearu.txt"; TEST(UnigramTrainerTest, EndToEndTest) { const std::string input = @@ -1694,7 +1709,7 @@ index 4b336ec..9b8ddb7 100644 " --input=", input, " --vocab_size=8000 --normalization_rule_name=identity", " --model_type=unigram --user_defined_symbols=", -@@ -168,7 +169,7 @@ TEST(UnigramTrainerTest, EndToEndTest) { +@@ -170,7 +171,7 @@ TEST(UnigramTrainerTest, EndToEndTest) { .ok()); SentencePieceProcessor sp; @@ -1757,6 +1772,3 @@ index 8288027..ab55030 100644 { auto output = filesystem::NewWritableFile(input_file); for (const auto &line : input) { --- -2.38.1.windows.1 - diff --git a/recipe/patches/0006-point-to-our-libs-headers-for-windows-in-setup.py.patch b/recipe/patches/0006-point-to-our-libs-headers-for-windows-in-setup.py.patch index d6ace87..b699384 100644 --- a/recipe/patches/0006-point-to-our-libs-headers-for-windows-in-setup.py.patch +++ b/recipe/patches/0006-point-to-our-libs-headers-for-windows-in-setup.py.patch @@ -1,26 +1,25 @@ -From 34cac78c9f294920175936944ad58a76e8bab7c0 Mon Sep 17 00:00:00 2001 +From e884436083b6d5c2beee6bc341f0a55958715f01 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 11 Dec 2022 01:09:03 +1100 -Subject: [PATCH 6/9] point to our libs / headers for windows in setup.py +Subject: [PATCH 6/8] point to our libs / headers for windows in setup.py -also do not risk building against bundled libs +also do not risk building against bundled libs, nor +setting /MT for the MSVC static runtime libs --- - python/setup.py | 20 ++++++++++++-------- - 1 file changed, 12 insertions(+), 8 deletions(-) + python/setup.py | 21 +++++++++++++-------- + 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/python/setup.py b/python/setup.py -index fdf9394..ba4dda8 100755 +index 5411231..d8f0b5e 100755 --- a/python/setup.py +++ b/python/setup.py -@@ -77,12 +77,12 @@ class build_ext(_build_ext): +@@ -77,10 +77,11 @@ class build_ext(_build_ext): """Override build_extension to run cmake.""" def build_extension(self, ext): - cflags, libs = get_cflags_and_libs('../build/root') -- if len(libs) == 0: ++ # cflags, libs = get_cflags_and_libs('./bundled/root') + cflags = ['-std=c++17'] -+ if False: - cflags, libs = get_cflags_and_libs('./bundled/root') - if len(libs) == 0: - if is_sentencepiece_installed(): @@ -29,23 +28,24 @@ index fdf9394..ba4dda8 100755 cflags = cflags + run_pkg_config('cflags') libs = run_pkg_config('libs') else: -@@ -108,17 +108,21 @@ if os.name == 'nt': +@@ -106,17 +107,21 @@ if os.name == 'nt': arch = 'win32' if sys.maxsize > 2**32: arch = 'amd64' - if os.path.exists('..\\build\\root_{}\\lib'.format(arch)): + if False: - cflags = ['/std:c++17', '/MT', '/I..\\build\\root_{}\\include'.format(arch)] + cflags = ['/std:c++17', '/I..\\build\\root_{}\\include'.format(arch)] libs = [ '..\\build\\root_{}\\lib\\sentencepiece.lib'.format(arch), - '..\\build\\root_{}\\lib\\sentencepiece_train.lib'.format(arch) + '..\\build\\root_{}\\lib\\sentencepiece_train.lib'.format(arch), ] - else: -- cflags = ['/std:c++17', '/MT', '/I..\\build\\root\\include'] -+ cflags = ['/std:c++17', '/MT', '/I' + os.environ["LIBRARY_INC"]] +- elif os.path.exists('..\\build\\root\\lib'): +- cflags = ['/std:c++17', '/I..\\build\\root\\include'] ++ elif True: ++ cflags = ['/std:c++17', '/MD', '/I' + os.environ["LIBRARY_INC"]] libs = [ - '..\\build\\root\\lib\\sentencepiece.lib', -- '..\\build\\root\\lib\\sentencepiece_train.lib' +- '..\\build\\root\\lib\\sentencepiece_train.lib', + # equivalent of -L$PREFIX/lib -lsentencepiece -lsentencepiece_train -lprotobuf + os.environ["LIBRARY_LIB"] + f"\\{x}.lib" + # protobuf actually has the lib-prefix in the name also on windows; @@ -53,8 +53,5 @@ index fdf9394..ba4dda8 100755 + # host dependencies for the link interface, i.e. also abseil + for x in ["sentencepiece", "sentencepiece_train", "libprotobuf", "abseil_dll"] ] - - SENTENCEPIECE_EXT = Extension( --- -2.38.1.windows.1 - + else: + # build library locally with cmake and vc++. diff --git a/recipe/patches/0008-also-install-pkg-config-files-on-windows.patch b/recipe/patches/0007-also-install-pkg-config-files-on-windows.patch similarity index 72% rename from recipe/patches/0008-also-install-pkg-config-files-on-windows.patch rename to recipe/patches/0007-also-install-pkg-config-files-on-windows.patch index 10f06c0..06dba0e 100644 --- a/recipe/patches/0008-also-install-pkg-config-files-on-windows.patch +++ b/recipe/patches/0007-also-install-pkg-config-files-on-windows.patch @@ -1,17 +1,17 @@ -From ae090b9b0fa029ae7ca5639318f6818989233a99 Mon Sep 17 00:00:00 2001 +From 8b8c5277ec74053eac438b6e4d006afa0138abed Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 12 Dec 2022 14:36:45 +1100 -Subject: [PATCH 8/9] also install pkg-config files on windows +Subject: [PATCH 7/8] also install pkg-config files on windows --- CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt -index d54b008..7e6196a 100644 +index 434530a..a741455 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -90,9 +90,7 @@ endif() +@@ -131,9 +131,7 @@ join_paths(includedir_for_pc_file "\${prefix}" "${CMAKE_INSTALL_INCLUDEDIR}") configure_file("${PROJECT_SOURCE_DIR}/config.h.in" "config.h") configure_file("${PROJECT_SOURCE_DIR}/sentencepiece.pc.in" "sentencepiece.pc" @ONLY) @@ -22,6 +22,3 @@ index d54b008..7e6196a 100644 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) endif() --- -2.38.1.windows.1 - diff --git a/recipe/patches/0007-don-t-force-replace-MD-with-MT.patch b/recipe/patches/0007-don-t-force-replace-MD-with-MT.patch deleted file mode 100644 index ae42aaa..0000000 --- a/recipe/patches/0007-don-t-force-replace-MD-with-MT.patch +++ /dev/null @@ -1,41 +0,0 @@ -From dee971649f71e70ebea6fc4c46ac0341008409c4 Mon Sep 17 00:00:00 2001 -From: "H. Vetinari" -Date: Sun, 11 Dec 2022 21:40:56 +1100 -Subject: [PATCH 7/9] don't force-replace /MD with /MT - ---- - CMakeLists.txt | 4 ---- - python/setup.py | 2 +- - 2 files changed, 1 insertion(+), 5 deletions(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index f4f7548..d54b008 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -60,10 +60,6 @@ else() - endif() - - if (MSVC) -- string(REPLACE "/MD" "/MT" CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) -- string(REPLACE "/MD" "/MT" CMAKE_CXX_FLAGS_MINSIZEREL ${CMAKE_CXX_FLAGS_MINSIZEREL}) -- string(REPLACE "/MD" "/MT" CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) -- string(REPLACE "/MD" "/MT" CMAKE_CXX_FLAGS_RELWITHDEBINFO ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) - add_definitions("/wd4267 /wd4244 /wd4305 /Zc:strictStrings /utf-8") - endif() - -diff --git a/python/setup.py b/python/setup.py -index ba4dda8..024cef3 100755 ---- a/python/setup.py -+++ b/python/setup.py -@@ -115,7 +115,7 @@ if os.name == 'nt': - '..\\build\\root_{}\\lib\\sentencepiece_train.lib'.format(arch) - ] - else: -- cflags = ['/std:c++17', '/MT', '/I' + os.environ["LIBRARY_INC"]] -+ cflags = ['/std:c++17', '/MD', '/I' + os.environ["LIBRARY_INC"]] - libs = [ - # equivalent of -L$PREFIX/lib -lsentencepiece -lsentencepiece_train -lprotobuf - os.environ["LIBRARY_LIB"] + f"\\{x}.lib" --- -2.38.1.windows.1 - diff --git a/recipe/patches/0009-create-and-install-CMake-metadata.patch b/recipe/patches/0008-create-and-install-CMake-metadata.patch similarity index 79% rename from recipe/patches/0009-create-and-install-CMake-metadata.patch rename to recipe/patches/0008-create-and-install-CMake-metadata.patch index 65c6fc7..368374d 100644 --- a/recipe/patches/0009-create-and-install-CMake-metadata.patch +++ b/recipe/patches/0008-create-and-install-CMake-metadata.patch @@ -1,20 +1,20 @@ -From 6ff22d17396ad48a087c6193b84de4939e6bf0bb Mon Sep 17 00:00:00 2001 +From 858ec6bd17ad9cbd6d3bbdd08faa22c70d764433 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 18 Jan 2023 19:44:15 +1100 -Subject: [PATCH 9/9] create and install CMake metadata +Subject: [PATCH 8/8] create and install CMake metadata --- CMakeLists.txt | 10 ++++++++++ sentencepieceConfig.cmake.in | 8 ++++++++ - src/CMakeLists.txt | 16 ++++++++++++---- - 3 files changed, 30 insertions(+), 4 deletions(-) + src/CMakeLists.txt | 17 +++++++++++++---- + 3 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 sentencepieceConfig.cmake.in diff --git a/CMakeLists.txt b/CMakeLists.txt -index 7e6196a..00b7baf 100644 +index a741455..372f78f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -94,6 +94,16 @@ if (TRUE) +@@ -135,6 +135,16 @@ if (TRUE) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) endif() @@ -46,19 +46,19 @@ index 0000000..b4c0474 +# Targets +include(${CMAKE_CURRENT_LIST_DIR}/sentencepieceTargets.cmake) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt -index 6d986b6..b327f25 100644 +index c130e8b..2ea219e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt -@@ -221,7 +221,7 @@ if (SPM_ENABLE_SHARED) - (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "sh4")) - list(APPEND SPM_LIBS "atomic") - endif() +@@ -232,7 +232,7 @@ endif() + if (SPM_ENABLE_SHARED) + target_link_libraries(sentencepiece ${SPM_LIBS}) + target_link_libraries(sentencepiece_train ${SPM_LIBS} sentencepiece) - set(SPM_INSTALLTARGETS sentencepiece sentencepiece_train) + set(SPM_LIBTARGETS sentencepiece sentencepiece_train) set_target_properties(sentencepiece sentencepiece_train PROPERTIES SOVERSION 0 VERSION 0.0.0) set_target_properties(sentencepiece PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES) set_target_properties(sentencepiece_train PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES) -@@ -235,7 +235,7 @@ if (SPM_ENABLE_SHARED) +@@ -246,7 +246,7 @@ if (SPM_ENABLE_SHARED) else() add_library(sentencepiece ALIAS sentencepiece-static) add_library(sentencepiece_train ALIAS sentencepiece_train-static) @@ -67,7 +67,7 @@ index 6d986b6..b327f25 100644 set_target_properties(sentencepiece-static PROPERTIES OUTPUT_NAME "sentencepiece") set_target_properties(sentencepiece_train-static PROPERTIES OUTPUT_NAME "sentencepiece_train") -@@ -282,13 +282,21 @@ if (SPM_ENABLE_NFKC_COMPILE) +@@ -293,7 +293,7 @@ if (SPM_ENABLE_NFKC_COMPILE) target_link_libraries(compile_charsmap sentencepiece sentencepiece_train) endif() @@ -75,11 +75,18 @@ index 6d986b6..b327f25 100644 +set(SPM_BINTARGETS spm_encode spm_decode spm_normalize spm_train spm_export_vocab) + if (CMAKE_SYSTEM_NAME STREQUAL "iOS") +@@ -303,12 +303,21 @@ if (CMAKE_SYSTEM_NAME STREQUAL "iOS") + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) + else() -install(TARGETS ${SPM_INSTALLTARGETS} +install(TARGETS ${SPM_BINTARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) + endif() + +install(TARGETS ${SPM_LIBTARGETS} + EXPORT sentencepieceTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} @@ -88,9 +95,7 @@ index 6d986b6..b327f25 100644 +install(EXPORT sentencepieceTargets + FILE sentencepieceTargets.cmake + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/sentencepiece") ++ install(FILES sentencepiece_trainer.h sentencepiece_processor.h DESTINATION ${CMAKE_INSTALL_INCDIR}) if (NOT SPM_USE_BUILTIN_PROTOBUF) --- -2.38.1.windows.1 - From d323215920b604dcd089a1a0ea106b010cc2b2db Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 6 Jun 2023 17:45:55 +1100 Subject: [PATCH 3/3] use a different url-scheme for the sources --- recipe/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipe/meta.yaml b/recipe/meta.yaml index b04c84f..d7ef79a 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -5,7 +5,7 @@ package: version: {{ version }} source: - url: https://github.com/google/sentencepiece/archive/v{{ version }}.tar.gz + url: https://github.com/google/sentencepiece/archive/refs/tags/v{{ version }}.tar.gz sha256: 63617eaf56c7a3857597dcd8780461f57dd21381b56a27716ef7d7e02e14ced4 patches: # trying to build both static & shared build seems to break on OSX