diff --git a/.gitignore b/.gitignore
index 5debbb6f..92e6ed3b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,7 +12,6 @@ Makefile.in
 /aclocal.m4
 /compile
 /config.guess
-/config.h.in
 /config.sub
 /configure
 /configure.scan
@@ -26,6 +25,11 @@ Makefile.in
 /autogen.sh
 /ltmain.sh
 
+CMakeFiles
+CMakeCache.txt
+config.h
+sentencepiece.pc
+
 *.o
 *.lo
 *.a
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 00000000..fac088d5
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,72 @@
+# Copyright 2018 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.!
+
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+project(sentencepiece VERSION 0.1.2 LANGUAGES CXX)
+
+option(SPM_ENABLE_NFKC_COMPILE "Enables NFKC compile" OFF)
+option(SPM_ENABLE_SHARED "Builds shared libraries in addition to static libraries." ON)
+option(SPM_BUILD_TEST "Builds test binaries." OFF)
+option(SPM_COVERAGE "Runs gcov to test coverage." OFF)
+option(SPM_ENABLE_TENSORFLOW_SHARED "Makes a tensorflow compatible shared file." OFF)
+option(SPM_ENABLE_TCMALLOC "Enable TCMalloc if available." ON)
+option(SPM_TCMALLOC_STATIC "Link static library of TCMALLOC." OFF)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+set(prefix ${CMAKE_INSTALL_PREFIX})
+set(exec_prefix "\${prefix}")
+set(libdir "\${exec_prefix}/lib")
+set(includedir "\${prefix}/include")
+set(GNUCXX_STD_SUPPORT_VERSION "4.3")
+
+if (APPLE)
+  set(CMAKE_MACOSX_RPATH ON)
+  set(CMAKE_SKIP_BUILD_RPATH FALSE)
+  set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
+  set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
+  set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+  list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES "${CMAKE_INSTALL_PREFIX}/lib" isSystemDir)
+  if ("${isSystemDir}" STREQUAL "-1")
+    set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
+  endif()
+endif()
+
+if (NOT DEFINED CMAKE_INSTALL_BINDIR)
+  set(CMAKE_INSTALL_BINDIR bin)
+endif()
+
+if (NOT DEFINED CMAKE_INSTALL_LIBDIR)
+  set(CMAKE_INSTALL_LIBDIR lib)
+endif()
+
+if (NOT DEFINED CMAKE_INSTALL_INCDIR)
+  set(CMAKE_INSTALL_INCDIR include)
+endif()
+
+configure_file("${PROJECT_SOURCE_DIR}/config.h.in" "config.h")
+configure_file("${PROJECT_SOURCE_DIR}/sentencepiece.pc.in" "sentencepiece.pc" @ONLY)
+
+if (NOT MSVC)
+  install(FILES "${CMAKE_BINARY_DIR}/sentencepiece.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+endif()
+
+include_directories(${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR})
+
+if (SPM_BUILD_TEST)
+  enable_testing()
+endif()
+
+add_subdirectory(src)
diff --git a/Makefile.am b/Makefile.am
deleted file mode 100644
index 069c27ac..00000000
--- a/Makefile.am
+++ /dev/null
@@ -1,25 +0,0 @@
-AUTOMAKE_OPTIONS = foreign
-SUBDIRS = src
-
-EXTRA_DIRS = m4 third_party data doc python
-EXTRA_DIST = README.md LICENSE
-ACLOCAL_AMFLAGS = -I third_party/m4
-
-pkgconfigdir = @pkgconfigdir@
-pkgconfig_DATA = sentencepiece.pc
-
-dist-hook:
-	for subdir in $(EXTRA_DIRS); do \
-	  cp -rp $$subdir $(distdir); \
-	  rm -f 
$(distdir)/$$subdir/*~; \ - rm -f $(distdir)/$$subdir/*.{bak,orig}; \ - rm -rf $(distdir)/$$subdir/CVS; \ - rm -rf $(distdir)/$$subdir/.svn; \ - rm -rf $(distdir)/.svn; \ - rm -rf $(distdir)/*/.svn; \ - rm -rf $(distdir)/*/*/.svn; \ - rm -rf $(distdir)/$$subdir/*/CVS; \ - rm -rf $(distdir)/$$subdir/*/.svn; \ - rm -rf $(distdir)/$$subdir/*/.pb.cc; \ - find $(distdir) -name .svn | xargs rm -fr; \ - done diff --git a/autogen.sh b/autogen.sh deleted file mode 100755 index 1e6d2213..00000000 --- a/autogen.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http:#www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.! - -echo "Running aclocal ..." -aclocal -I . -echo "Running autoheader..." -autoheader -echo "Running libtoolize .." -case `uname` in Darwin*) glibtoolize ;; - *) libtoolize ;; -esac -echo "Running automake ..." -automake --add-missing --copy -echo "Running autoconf ..." -autoconf diff --git a/config.h.in b/config.h.in new file mode 100644 index 00000000..0413fea7 --- /dev/null +++ b/config.h.in @@ -0,0 +1,9 @@ +#ifndef CONFIG_H_ +#define CONFIG_H_ + +#define VERSION "@PROJECT_VERSION@" +#define PACKAGE "@PROJECT_NAME@" +#define PACKAGE_STRING "@PROJECT_NAME@" + + +#endif // CONFIG_H_ diff --git a/configure.ac b/configure.ac deleted file mode 100644 index d010d486..00000000 --- a/configure.ac +++ /dev/null @@ -1,88 +0,0 @@ -# -*- Autoconf -*- -# Process this file with autoconf to produce a configure script. - -AC_PREREQ([2.69]) -AC_INIT([sentencepiece], [0.1.0], [taku@google.com]) -AM_INIT_AUTOMAKE() -AC_CONFIG_SRCDIR([src/normalizer.h]) -AC_CONFIG_HEADERS([config.h]) -AC_CONFIG_MACRO_DIR([third_party/m4]) - -# Checks for programs. -AC_LANG([C++]) -AC_PROG_LIBTOOL -AC_PROG_CXX -AC_PROG_CC - -CXXFLAGS="-std=c++11 -Wall -O3" - -PKG_CHECK_MODULES(PROTOBUF, protobuf >= 2.4.0) -AC_SUBST(PROTOBUF_LIBS) -AC_SUBST(PROTOBUF_CFLAGS) -AC_SUBST(PROTOBUF_VERSION) -CXXFLAGS="$CXXFLAGS $PROTOBUF_CFLAGS" -LIBS="$LIBS $PROTOBUF_LIBS" - -m4_include([third_party/m4/ax_check_icu.m4]) - -if test ! -n "$PROTOC"; then - AC_CHECK_PROG([PROTOC], [protoc], [protoc]) -fi -AS_IF([test "x${PROTOC}" == "x"], - [AC_MSG_ERROR([ProtoBuf compiler "protoc" not found. You can install them with "sudo apt-get install libprotobuf-c++ protobuf-compiler" ])]) - -# --enable-nfkc-compile flag. -AC_MSG_CHECKING([nfkc-compile option]) -AC_ARG_ENABLE([nfkc-compile], - [AS_HELP_STRING([--enable-nfkc-compile], [compile NFKC normalizer mapping])], - [], - [enable_nfkc_compile=no]) -AC_MSG_RESULT([$enable_nfkc_compile]) - -if test "${enable_nfkc_compile}" = "yes"; then - AX_CHECK_ICU([40], [], AC_MSG_ERROR([Library requirements (ICU) not met.])) - CXXFLAGS="$CXXFLAGS -DENABLE_NFKC_COMPILE" - LIBS="$LIBS $ICU_LIBS" -fi - -# --enable-gcov flag. 
-AC_MSG_CHECKING([gcov option]) -AC_ARG_ENABLE([gcov], - [AS_HELP_STRING([--enable-gcov], [generate enable-gcov files])], - [], - [enable_gcov=no]) -AC_MSG_RESULT([$enable_gcov]) - -if test "${enable_gcov}" = "yes"; then - CXXFLAGS="-std=c++11 -Wall -fPIC -fprofile-arcs -O0 -coverage" - LIBS="$LIBS -lgcov" -fi - -# pkgconfigdir -AC_ARG_WITH(pkgconfigdir, - AC_HELP_STRING([--with-pkgconfigdir], - [Use the specified pkgconfig dir (default is libdir/pkgconfig)]), - [pkgconfigdir=${withval}], - [pkgconfigdir='${libdir}/pkgconfig']) -AC_MSG_NOTICE([pkgconfig directory is ${pkgconfigdir}]) -pkgconfigcflags=$CFLAGS -pkgconfiglibs=$LIBS -AC_SUBST([pkgconfigdir]) -AC_SUBST([pkgconfigcflags]) -AC_SUBST([pkgconfiglibs]) - -# Checks for header files. -AC_CHECK_HEADERS([unistd.h]) - -# Checks for typedefs, structures, and compiler characteristics. -AC_TYPE_SIZE_T - -# Checks for library functions. -AC_FUNC_STRTOD -AC_CHECK_FUNCS([memchr memset]) - -AC_CONFIG_FILES([Makefile - src/Makefile - sentencepiece.pc]) - -AC_OUTPUT diff --git a/python/make_py_wheel.sh b/python/make_py_wheel.sh index 8842bc08..d103ecfd 100755 --- a/python/make_py_wheel.sh +++ b/python/make_py_wheel.sh @@ -1,5 +1,4 @@ #!/bin/bash - # Copyright 2018 Google Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,6 +17,7 @@ set -e # exit immediately on error set -x # display all commands PROTOBUF_VERSION=3.6.0 +CMAKE_VERSION=3.12.0 run_docker() { cd `dirname $0` @@ -31,17 +31,15 @@ run_docker() { build() { TRG=$1 - rm -fr tmp - mkdir -p tmp - - # Installs necessary libraries under `tmp` sub directory. - cd tmp + rm -fr build + mkdir -p build + cd build - # Install libtool - curl -L -O http://ftpmirror.gnu.org/libtool/libtool-2.4.6.tar.gz - tar zxfv libtool-2.4.6.tar.gz - cd libtool-2.4.6 - ./configure + # Install cmake + curl -L -O https://cmake.org/files/v3.12/cmake-${CMAKE_VERSION}.tar.gz + tar zxfv cmake-${CMAKE_VERSION}.tar.gz + cd cmake-${CMAKE_VERSION} + ./bootstrap make -j4 make install cd .. @@ -51,24 +49,16 @@ build() { tar zxfv protobuf-cpp-${PROTOBUF_VERSION}.tar.gz cd protobuf-${PROTOBUF_VERSION} ./configure --disable-shared --with-pic - make CXXFLAGS+="-std=c++11 -O3" \ - CFLAGS+="-std=c++11 -O3" -j4 + make CXXFLAGS+="-std=c++11 -O3" CFLAGS+="-std=c++11 -O3" -j4 make install || true - cd ../.. + cd .. # Install sentencepiece + cmake ../.. -DSPM_ENABLE_SHARED=OFF + make -j4 + make install cd .. - make distclean || true - ./autogen.sh - grep -v PKG_CHECK_MODULES configure > tmp - mv tmp -f configure - chmod +x configure - LIBS+="-pthread -L/usr/local/lib -lprotobuf" ./configure --disable-shared --with-pic - make CXXFLAGS+="-std=c++11 -O3" \ - CFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" -j4 - make install || true - cd python for i in /opt/python/* do $i/bin/python setup.py bdist @@ -86,8 +76,9 @@ build() { done mv -f wheelhouse/*${TRG}.whl . - cd .. && rm -fr tmp - cd .. && make distclean + + cd .. + rm -fr build } if [ "$1" = "native" ]; then diff --git a/sentencepiece.pc.in b/sentencepiece.pc.in index 8add641f..048548eb 100644 --- a/sentencepiece.pc.in +++ b/sentencepiece.pc.in @@ -3,8 +3,8 @@ exec_prefix=@exec_prefix@ libdir=@libdir@ includedir=@includedir@ -Name: @PACKAGE@ +Name: @PROJECT_NAME@ Description: Unsupervised text tokenizer and detokenizer for Neural Network-based text generation. 
-Version: @VERSION@ -Libs: -L${libdir} -lsentencepiece -lsentencepiece_train @pkgconfiglibs@ +Version: @PROJECT_VERSION@ +Libs: -L${libdir} -lsentencepiece -lsentencepiece_train -lprotobuf @pkgconfiglibs@ Cflags: -I${includedir} @pkgconfigcflags@ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 00000000..59206d21 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,231 @@ +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.! + +find_package(Protobuf REQUIRED) +include_directories(${Protobuf_INCLUDE_DIRS}) +protobuf_generate_cpp(SPM_PROTO_SRCS SPM_PROTO_HDRS sentencepiece.proto) +protobuf_generate_cpp(SPM_MODEL_PROTO_SRCS SPM_MODEL_PROTO_HDRS sentencepiece_model.proto) + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${PROTOBUF_INCLUDE_DIR}) + +set(SPM_SRCS + ${SPM_PROTO_HDRS} + ${SPM_PROTO_SRCS} + ${SPM_MODEL_PROTO_HDRS} + ${SPM_MODEL_PROTO_SRCS} + bpe_model.h + common.h + normalizer.h + util.h + flags.h + sentencepiece_processor.h + word_model.h + model_factory.h + char_model.h + model_interface.h + testharness.h + unigram_model.h + bpe_model.cc + char_model.cc + error.cc + flags.cc + model_factory.cc + model_interface.cc + normalizer.cc + sentencepiece_processor.cc + unigram_model.cc + util.cc + word_model.cc + ../third_party/absl/strings/string_view.cc) + +set(SPM_TRAIN_SRCS + ${SPM_PROTO_HDRS} + ${SPM_MODEL_PROTO_HDRS} + builder.h + normalization_rule.h + unicode_script.h + unicode_script_map.h + trainer_factory.h + trainer_interface.h + unigram_model_trainer.h + word_model_trainer.h + char_model_trainer.h + bpe_model_trainer.h + sentencepiece_trainer.h + builder.cc + unicode_script.cc + trainer_factory.cc + trainer_interface.cc + unigram_model_trainer.cc + word_model_trainer.cc + char_model_trainer.cc + bpe_model_trainer.cc + sentencepiece_trainer.cc) + +set(SPM_TEST_SRCS + ${SPM_PROTO_HDRS} + ${SPM_MODEL_PROTO_HDRS} + testharness.h + bpe_model_test.cc + bpe_model_trainer_test.cc + builder_test.cc + char_model_test.cc + char_model_trainer_test.cc + flags_test.cc + model_factory_test.cc + model_interface_test.cc + normalizer_test.cc + sentencepiece_processor_test.cc + sentencepiece_trainer_test.cc + test_main.cc + testharness.cc + trainer_factory_test.cc + trainer_interface_test.cc + unicode_script_test.cc + unigram_model_test.cc + unigram_model_trainer_test.cc + util_test.cc + word_model_test.cc + word_model_trainer_test.cc) + +find_package(Threads REQUIRED) + +set(SPM_LIBS ${PROTOBUF_LIBRARY} Threads::Threads) + +if (SPM_ENABLE_NFKC_COMPILE) + find_package(ICU 4.4 COMPONENTS i18n data uc REQUIRED) + include_directories(${ICU_INCLUDE_DIRS}) + add_definitions(-DENABLE_NFKC_COMPILE) + list(APPEND SPM_LIBS ICU::i18n ICU::data ICU::uc) +endif() + +if (SPM_ENABLE_TCMALLOC) + if (SPM_TCMALLOC_STATIC) + find_library(TCMALLOC_LIB NAMES libtcmalloc_minimal.a) + else() + find_library(TCMALLOC_LIB NAMES tcmalloc_minimal) + endif() + if (TCMALLOC_LIB) + list(APPEND SPM_LIBS ${TCMALLOC_LIB}) + endif() +endif() + +if 
(SPM_ENABLE_SHARED) + add_library(sentencepiece SHARED ${SPM_SRCS}) + add_library(sentencepiece_train SHARED ${SPM_TRAIN_SRCS}) +endif() + +add_library(sentencepiece-static STATIC ${SPM_SRCS}) +add_library(sentencepiece_train-static STATIC ${SPM_TRAIN_SRCS}) + +target_link_libraries(sentencepiece-static INTERFACE ${SPM_LIBS}) +target_link_libraries(sentencepiece_train-static INTERFACE sentencepiece-static ${SPM_LIBS}) + +if (SPM_ENABLE_SHARED) + target_link_libraries(sentencepiece ${SPM_LIBS}) + target_link_libraries(sentencepiece_train ${SPM_LIBS} sentencepiece) + set(SPM_INSTALLTARGETS sentencepiece sentencepiece_train sentencepiece-static sentencepiece_train-static) + set_target_properties(sentencepiece sentencepiece_train PROPERTIES SOVERSION 0 VERSION 0.1.0) + if (MSVC) + set_target_properties(sentencepiece PROPERTIES IMPORT_SUFFIX "_import.lib") + set_target_properties(sentencepiece_train PROPERTIES IMPORT_SUFFIX "_import.lib") + elseif (MINGW) + set_target_properties(sentencepiece PROPERTIES IMPORT_SUFFIX ".dll.a") + set_target_properties(sentencepiece_train PROPERTIES IMPORT_SUFFIX ".dll.a") + endif() +else() + add_library(sentencepiece ALIAS sentencepiece-static) + add_library(sentencepiece_train ALIAS sentencepiece_train-static) + set(SPM_INSTALLTARGETS sentencepiece-static sentencepiece_train-static) +endif() + +set_target_properties(sentencepiece-static PROPERTIES OUTPUT_NAME "sentencepiece") +set_target_properties(sentencepiece_train-static PROPERTIES OUTPUT_NAME "sentencepiece_train") + +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + if (SPM_COVERAGE) + set(CMAKE_CXX_FLAGS "-O0 -Wall -fPIC -coverage ${CMAKE_CXX_FLAGS}") + else() + set(CMAKE_CXX_FLAGS "-O3 -Wall -fPIC ${CMAKE_CXX_FLAGS}") + endif() + if (SPM_ENABLE_TENSORFLOW_SHARED) + add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) + endif() + set_source_files_properties( + sentencepiece.pb.cc sentencepiece_model.pb.cc + PROPERTIES COMPILE_FLAGS "-Wno-misleading-indentation") + set_source_files_properties(${SPM_TEST_SRCS} + PROPERTIES COMPILE_FLAGS "-Wno-sign-compare") + if (SPM_ENABLE_SHARED) + set_property(TARGET sentencepiece APPEND_STRING PROPERTY COMPILE_FLAGS " -DPIC") + set_property(TARGET sentencepiece_train APPEND_STRING PROPERTY COMPILE_FLAGS " -DPIC") + endif() +endif() + +add_executable(spm_encode spm_encode_main.cc) +add_executable(spm_decode spm_decode_main.cc) +add_executable(spm_normalize spm_normalize_main.cc) +add_executable(spm_train spm_train_main.cc) +add_executable(spm_export_vocab spm_export_vocab_main.cc) + +target_link_libraries(spm_encode sentencepiece) +target_link_libraries(spm_decode sentencepiece) +target_link_libraries(spm_normalize sentencepiece sentencepiece_train) +target_link_libraries(spm_train sentencepiece sentencepiece_train) +target_link_libraries(spm_export_vocab sentencepiece) + +if (SPM_ENABLE_NFKC_COMPILE) + add_executable(compile_charsmap compile_charsmap_main.cc) + target_link_libraries(compile_charsmap sentencepiece) +endif() + +list(APPEND SPM_INSTALLTARGETS + spm_encode spm_decode spm_normalize spm_train spm_export_vocab) + +install(TARGETS ${SPM_INSTALLTARGETS} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) +install(FILES sentencepiece_trainer.h sentencepiece_processor.h + DESTINATION ${CMAKE_INSTALL_INCDIR}) + +if (SPM_BUILD_TEST OR SPM_COVERAGE) + enable_testing() + add_executable(spm_test test_main.cc ${SPM_TEST_SRCS}) + + if 
(SPM_COVERAGE)
+    target_link_libraries(spm_test sentencepiece sentencepiece_train "-lgcov")
+  else()
+    target_link_libraries(spm_test sentencepiece sentencepiece_train)
+  endif()
+
+  set(MEMORYCHECK_COMMAND_OPTIONS "--leak-check=full --show-leak-kinds=definite,possible --error-exitcode=1")
+  find_program(CTEST_MEMORYCHECK_COMMAND NAMES valgrind)
+  include(Dart)
+
+  add_test(NAME sentencepiece_test
+    COMMAND $<TARGET_FILE:spm_test> --data_dir=${PROJECT_SOURCE_DIR}/data)
+endif()
+
+if (SPM_COVERAGE)
+  add_custom_target(coverage
+    COMMAND mkdir -p coverage
+    COMMAND $<TARGET_FILE:spm_test> --data_dir=${PROJECT_SOURCE_DIR}/data
+    COMMAND lcov -c -d . -o coverage.info
+    COMMAND lcov --remove coverage.info "include*" "/c++" "_test*" "testharness*" "third_party*" ".pb.*" -o coverage.info
+    COMMAND mkdir -p lcov_html
+    COMMAND genhtml -o lcov_html coverage.info)
+  add_dependencies(coverage spm_test)
+endif()
diff --git a/src/Makefile.am b/src/Makefile.am
deleted file mode 100644
index d815a597..00000000
--- a/src/Makefile.am
+++ /dev/null
@@ -1,108 +0,0 @@
-lib_LTLIBRARIES = libsentencepiece.la libsentencepiece_train.la
-
-AM_CXXFLAS = -I($srcdir)
-AUTOMAKE_OPTIONS = subdir-objects
-
-libsentencepiece_la_SOURCES = \
- error.cc \
- flags.cc \
- sentencepiece_processor.cc \
- util.cc \
- normalizer.cc \
- unicode_script_map.h util.h \
- common.h \
- flags.h normalizer.h sentencepiece_processor.h \
- model_factory.h model_factory.cc \
- model_interface.h model_interface.cc \
- unigram_model.h unigram_model.cc \
- word_model.h word_model.cc \
- char_model.h char_model.cc \
- bpe_model.h bpe_model.cc \
- ../third_party/absl/strings/string_view.cc
-include_HEADERS = sentencepiece_processor.h sentencepiece_trainer.h
-
-# noinst_LIBRARIES = libsentencepiecetrain.a
-libsentencepiece_train_la_SOURCES = builder.cc builder.h \
- normalization_rule.h \
- unicode_script.h unicode_script.cc \
- trainer_factory.h trainer_factory.cc \
- trainer_interface.h trainer_interface.cc \
- unigram_model_trainer.h unigram_model_trainer.cc \
- word_model_trainer.h word_model_trainer.cc \
- char_model_trainer.h char_model_trainer.cc \
- bpe_model_trainer.h bpe_model_trainer.cc \
- sentencepiece_trainer.h sentencepiece_trainer.cc
-
-nodist_libsentencepiece_la_SOURCES = \
- sentencepiece.pb.cc sentencepiece.pb.h \
- sentencepiece_model.pb.cc sentencepiece_model.pb.h
-
-BUILT_SOURCES = \
- sentencepiece.pb.cc \
- sentencepiece_model.pb.cc
-
-EXTRA_DIST = sentencepiece.proto sentencepiece_model.proto
-
-bin_PROGRAMS = spm_encode spm_decode spm_normalize spm_train spm_export_vocab
-noinst_PROGRAMS = compile_charsmap
-
-spm_encode_SOURCES = spm_encode_main.cc
-spm_encode_LDADD = libsentencepiece.la
-
-spm_decode_SOURCES = spm_decode_main.cc
-spm_decode_LDADD = libsentencepiece.la
-
-spm_normalize_SOURCES = spm_normalize_main.cc
-spm_normalize_LDADD = libsentencepiece_train.la libsentencepiece.la
-
-spm_export_vocab_SOURCES = spm_export_vocab_main.cc
-spm_export_vocab_LDADD = libsentencepiece.la
-
-spm_train_SOURCES = spm_train_main.cc
-spm_train_LDADD = libsentencepiece_train.la libsentencepiece.la
-
-compile_charsmap_SOURCES = compile_charsmap_main.cc
-compile_charsmap_LDADD = libsentencepiece_train.la libsentencepiece.la
-
-
-check_PROGRAMS = spm_test
-TESTS = spm_test
-spm_test_SOURCES = testharness.h \
- builder_test.cc \
- flags_test.cc \
- normalizer_test.cc \
- sentencepiece_processor_test.cc \
- sentencepiece_trainer_test.cc \
- unicode_script_test.cc \
- model_interface_test.cc \
- model_factory_test.cc \
- trainer_interface_test.cc \
- trainer_factory_test.cc \
- 
word_model_test.cc \ - word_model_trainer_test.cc \ - bpe_model_test.cc \ - bpe_model_trainer_test.cc \ - char_model_test.cc \ - char_model_trainer_test.cc \ - unigram_model_test.cc\ - unigram_model_trainer_test.cc \ - util_test.cc \ - test_main.cc \ - testharness.cc - -spm_test_LDADD = libsentencepiece_train.la libsentencepiece.la - -CLEANFILES = *.pb.cc *.pb.h *.pb.h *.gcda *.gcno *.info -clean-local: - -rm -rf lcov_html - -%.pb.cc %.pb.h: %.proto - $(PROTOC) --cpp_out=$(srcdir) $< - -coverage: - make clean - make -j10 CXXFLAGS+="-O0 -Wall -std=c++11 -coverage" LIBS+="-lgcov -lprotobuf" check - lcov -c -d . -o coverage.info - lcov --remove coverage.info "include*" "/c++" "_test*" "testharness*" "third_party*" ".pb.*" -o coverage.info - mkdir -p lcov_html - genhtml -o lcov_html coverage.info diff --git a/src/bpe_model_trainer_test.cc b/src/bpe_model_trainer_test.cc index 71d49bac..01e3864c 100644 --- a/src/bpe_model_trainer_test.cc +++ b/src/bpe_model_trainer_test.cc @@ -16,11 +16,14 @@ #include #include +#include "flags.h" #include "sentencepiece_processor.h" #include "sentencepiece_trainer.h" #include "testharness.h" #include "util.h" +DECLARE_string(data_dir); + namespace sentencepiece { namespace bpe { namespace { @@ -87,13 +90,14 @@ TEST(BPETrainerTest, BasicTest) { TEST(BPETrainerTest, EndToEndTest) { const test::ScopedTempFile sf("tmp_model"); - EXPECT_OK(SentencePieceTrainer::Train( - std::string("--model_prefix=") + sf.filename() + - " --input=../data/wagahaiwa_nekodearu.txt" - " --vocab_size=8000" - " --normalization_rule_name=identity" - " --model_type=bpe" - " --control_symbols=")); + EXPECT_OK(SentencePieceTrainer::Train(std::string("--model_prefix=") + + sf.filename() + + " --input=" + FLAGS_data_dir + + "/wagahaiwa_nekodearu.txt" + " --vocab_size=8000" + " --normalization_rule_name=identity" + " --model_type=bpe" + " --control_symbols=")); SentencePieceProcessor sp; EXPECT_OK(sp.Load(std::string(sf.filename()) + ".model")); diff --git a/src/builder.cc b/src/builder.cc index be5e45da..e42503d8 100644 --- a/src/builder.cc +++ b/src/builder.cc @@ -17,6 +17,8 @@ #include #include +#include "config.h" + #ifdef ENABLE_NFKC_COMPILE #include #include @@ -326,7 +328,8 @@ util::Status Builder::BuildNFKCMap(CharsMap *chars_map) { if (nfkc == nfkd) { continue; } - // Expand all possible sequences which are normalized into the same `nfkd`. + // Expand all possible sequences which are normalized into the same + // `nfkd`. 
for (const auto &nfkd_orig : ExpandUnnormalized(nfkd, norm2orig)) { if (nfkd_orig != nfkc) { nfkc_map[nfkd_orig] = nfkc; diff --git a/src/builder_test.cc b/src/builder_test.cc index 212d3d13..a3af4441 100644 --- a/src/builder_test.cc +++ b/src/builder_test.cc @@ -14,11 +14,14 @@ #include "builder.h" #include "common.h" +#include "flags.h" #include "normalizer.h" #include "sentencepiece_trainer.h" #include "testharness.h" #include "util.h" +DECLARE_string(data_dir); + namespace sentencepiece { namespace normalizer { @@ -135,7 +138,7 @@ TEST(BuilderTest, CompileCharsMap) { TEST(BuilderTest, LoadCharsMapTest) { Builder::CharsMap chars_map; - EXPECT_OK(Builder::LoadCharsMap("../data/nfkc.tsv", &chars_map)); + EXPECT_OK(Builder::LoadCharsMap(FLAGS_data_dir + "/nfkc.tsv", &chars_map)); std::string precompiled, expected; EXPECT_OK(Builder::CompileCharsMap(chars_map, &precompiled)); diff --git a/src/common.h b/src/common.h index 4516be9f..7e75bdac 100644 --- a/src/common.h +++ b/src/common.h @@ -24,9 +24,7 @@ #include #include -#ifdef HAVE_CONFIG_H #include "config.h" -#endif #if defined(_WIN32) && !defined(__CYGWIN__) #define OS_WIN diff --git a/src/flags.cc b/src/flags.cc index c33e035e..830c2cb8 100644 --- a/src/flags.cc +++ b/src/flags.cc @@ -23,11 +23,8 @@ #include #include "common.h" -#include "util.h" - -#ifdef HAVE_CONFIG_H #include "config.h" -#endif +#include "util.h" namespace sentencepiece { namespace flags { diff --git a/src/sentencepiece_trainer_test.cc b/src/sentencepiece_trainer_test.cc index 0c2107de..ead34c19 100644 --- a/src/sentencepiece_trainer_test.cc +++ b/src/sentencepiece_trainer_test.cc @@ -13,31 +13,36 @@ // limitations under the License.! #include "sentencepiece_trainer.h" +#include "flags.h" #include "sentencepiece_model.pb.h" #include "testharness.h" #include "util.h" +DECLARE_string(data_dir); + namespace sentencepiece { namespace { TEST(SentencePieceTrainerTest, TrainFromArgsTest) { - SentencePieceTrainer::Train( - "--input=../data/botchan.txt --model_prefix=m --vocab_size=1000"); - SentencePieceTrainer::Train( - "--input=../data/botchan.txt --model_prefix=m --vocab_size=1000 " - "--model_type=bpe"); - SentencePieceTrainer::Train( - "--input=../data/botchan.txt --model_prefix=m --vocab_size=1000 " - "--model_type=char"); - SentencePieceTrainer::Train( - "--input=../data/botchan.txt --model_prefix=m --vocab_size=1000 " - "--model_type=word"); + std::string input = FLAGS_data_dir + "/botchan.txt"; + SentencePieceTrainer::Train(std::string("--input=") + input + + " --model_prefix=m --vocab_size=1000"); + SentencePieceTrainer::Train(std::string("--input=") + input + + " --model_prefix=m --vocab_size=1000 " + "--model_type=bpe"); + SentencePieceTrainer::Train(std::string("--input=") + input + + " --model_prefix=m --vocab_size=1000 " + "--model_type=char"); + SentencePieceTrainer::Train(std::string("--input=") + input + + " --model_prefix=m --vocab_size=1000 " + "--model_type=word"); } TEST(SentencePieceTrainerTest, TrainWithCustomNormalizationRule) { - SentencePieceTrainer::Train( - "--input=../data/botchan.txt --model_prefix=m --vocab_size=1000 " - "--normalization_rule_tsv=../data/nfkc.tsv"); + SentencePieceTrainer::Train("--input=" + FLAGS_data_dir + + "/botchan.txt --model_prefix=m --vocab_size=1000 " + "--normalization_rule_tsv=" + + FLAGS_data_dir + "/nfkc.tsv"); } TEST(SentencePieceTrainerTest, TrainErrorTest) { @@ -50,7 +55,7 @@ TEST(SentencePieceTrainerTest, TrainErrorTest) { TEST(SentencePieceTrainerTest, TrainTest) { TrainerSpec trainer_spec; - 
trainer_spec.add_input("../data/botchan.txt"); + trainer_spec.add_input(FLAGS_data_dir + "/botchan.txt"); trainer_spec.set_model_prefix("m"); trainer_spec.set_vocab_size(1000); NormalizerSpec normalizer_spec; diff --git a/src/test_main.cc b/src/test_main.cc index 000d0134..9ec2b3f3 100644 --- a/src/test_main.cc +++ b/src/test_main.cc @@ -12,9 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License.! +#include "flags.h" #include "testharness.h" +DEFINE_string(data_dir, "../data", "Data directory"); + int main(int argc, char **argv) { + std::vector rest_args; + sentencepiece::flags::ParseCommandLineFlags(argc, argv, &rest_args); + sentencepiece::test::RunAllTests(); return 0; } diff --git a/src/unigram_model_trainer_test.cc b/src/unigram_model_trainer_test.cc index aa604275..c7164fa0 100644 --- a/src/unigram_model_trainer_test.cc +++ b/src/unigram_model_trainer_test.cc @@ -13,12 +13,16 @@ // limitations under the License.! #include "unigram_model_trainer.h" + +#include "flags.h" #include "sentencepiece_model.pb.h" #include "sentencepiece_processor.h" #include "sentencepiece_trainer.h" #include "testharness.h" #include "util.h" +DECLARE_string(data_dir); + namespace sentencepiece { namespace unigram { namespace { @@ -38,7 +42,8 @@ TEST(UnigramTrainerTest, EndToEndTest) { EXPECT_OK(SentencePieceTrainer::Train( std::string("--model_prefix=") + sf.filename() + - " --input=../data/wagahaiwa_nekodearu.txt" + " --input=" + FLAGS_data_dir + + "/wagahaiwa_nekodearu.txt" " --vocab_size=8000" " --normalization_rule_name=identity" " --model_type=unigram" diff --git a/tensorflow/make_py_wheel.sh b/tensorflow/make_py_wheel.sh index 7e61cb2e..044144d5 100755 --- a/tensorflow/make_py_wheel.sh +++ b/tensorflow/make_py_wheel.sh @@ -18,6 +18,7 @@ set -e # exit immediately on error set -x # display all commands PROTOBUF_VERSION=3.6.0 +CMAKE_VERSION=3.12.0 run_docker() { cd `dirname $0` @@ -30,19 +31,18 @@ run_docker() { } build() { - rm -fr tmp - mkdir -p tmp - + TRG=$1 + rm -fr build + mkdir -p build + cd build + export PATH="/opt/python/cp27-cp27mu/bin:${PATH}" - # Installs necessary libraries under `tmp` sub directory. - cd tmp - - # Install libtool - curl -L -O http://ftpmirror.gnu.org/libtool/libtool-2.4.6.tar.gz - tar zxfv libtool-2.4.6.tar.gz - cd libtool-2.4.6 - ./configure + # Install cmake + curl -L -O https://cmake.org/files/v3.12/cmake-${CMAKE_VERSION}.tar.gz + tar zxfv cmake-${CMAKE_VERSION}.tar.gz + cd cmake-${CMAKE_VERSION} + ./bootstrap make -j4 make install cd .. @@ -53,30 +53,23 @@ build() { cd protobuf-${PROTOBUF_VERSION} ./configure --disable-shared --with-pic make CXXFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" \ - CFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" -j4 - make install || true - cd ../.. + CFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" -j4 + make install + cd .. # Install sentencepiece + cmake ../.. -DSPM_ENABLE_SHARED=OFF -DSPM_ENABLE_TENSORFLOW_SHARED=ON + make -j4 + make install cd .. 
- make distclean || true - ./autogen.sh - grep -v PKG_CHECK_MODULES configure > tmp - mv tmp -f configure - chmod +x configure - LIBS+="-pthread -L/usr/local/lib -lprotobuf" ./configure --disable-shared --with-pic - make CXXFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" \ - CFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" -j4 - make install || true # Builds _sentencepiece_processor_ops.so - cd tensorflow pip install tensorflow TF_CFLAGS="-I/opt/python/cp27-cp27mu/lib/python2.7/site-packages/tensorflow/include" TF_LFLAGS="-L/opt/python/cp27-cp27mu/lib/python2.7/site-packages/tensorflow -ltensorflow_framework" g++ -std=c++11 -shared \ - -I../src \ + -I../../src \ -fPIC ${TF_CFLAGS[@]} -O2 \ -D_GLIBCXX_USE_CXX11_ABI=0 \ -Wl,--whole-archive \ @@ -92,8 +85,7 @@ build() { python setup.py bdist_wheel --universal --plat-name=manylinux1_x86_64 python setup.py sdist - rm -fr build tf_sentencepiece.egg-info tmp - cd .. && make distclean + rm -fr build tf_sentencepiece.egg-info } if [ "$1" = "native" ]; then diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so index 1e54b802..5f420dcc 100755 Binary files a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so and b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so differ diff --git a/test.sh b/test.sh index ff6ee78c..c0b4eece 100755 --- a/test.sh +++ b/test.sh @@ -19,7 +19,7 @@ set -x # display all commands setup_ubuntu() { apt-get update - apt-get install -y build-essential autoconf automake libtool git \ + apt-get install -y build-essential cmake git \ pkg-config libprotobuf-c++ protobuf-compiler libprotobuf-dev python-pip python3-pip } @@ -29,18 +29,22 @@ setup_debian() { setup_fedora() { dnf update -y - dnf install -y rpm-build gcc-c++ make protobuf-devel autoconf automake libtool pkg-config python-pip python-devel + dnf install -y rpm-build gcc-c++ make protobuf-devel cmake pkg-config python-pip python-devel } build_generic() { - ./autogen.sh - ./configure + mkdir -p build + cd build + cmake .. -DSPM_BUILD_TEST=ON make -j2 - make check -j2 + make test + cd .. } build_python() { + cd build make install + cd .. export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig ldconfig -v @@ -61,15 +65,19 @@ build_tensorflow() { build_linux_gcc_coverall_ubuntu() { setup_debian + apt-get install -y lcov pip install cpp-coveralls pip install 'requests[security]' build_generic build_python build_tensorflow - make distclean - ./configure --enable-gcov - make check -j2 + mkdir -p build + cd build + cmake .. -DSPM_COVERAGE=ON + make -j2 + make coverage coveralls --exclude-pattern '.*(include|usr|test|third_party|pb|_main).*' --gcov-options '\-lp' --gcov gcov + cd .. } build_linux_gcc_ubuntu() { @@ -106,8 +114,8 @@ build_linux_clang_ubuntu() { apt-get install -y clang-${v} export CXX="clang++-${v}" CC="clang-${v}" build_generic - make distclean - done + rm -fr build + done } build_osx() { @@ -115,7 +123,9 @@ build_osx() { brew install protobuf || brew link --overwrite protobuf brew link --overwrite python@2 build_generic + cd build make install + cd .. 
cd python # Test default Python python setup.py test diff --git a/third_party/m4/LICENSE b/third_party/m4/LICENSE deleted file mode 100644 index a14c56b3..00000000 --- a/third_party/m4/LICENSE +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) 2008 Akos Maroy -# -# Copying and distribution of this file, with or without modification, are -# permitted in any medium without royalty provided the copyright notice -# and this notice are preserved. This file is offered as-is, without any -# warranty. diff --git a/third_party/m4/ax_check_icu.m4 b/third_party/m4/ax_check_icu.m4 deleted file mode 100644 index f30de163..00000000 --- a/third_party/m4/ax_check_icu.m4 +++ /dev/null @@ -1,80 +0,0 @@ -# =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_check_icu.html -# =========================================================================== -# -# SYNOPSIS -# -# AX_CHECK_ICU(version, action-if, action-if-not) -# -# DESCRIPTION -# -# Defines ICU_LIBS, ICU_CFLAGS, ICU_CXXFLAGS. See icu-config(1) man page. -# -# LICENSE -# -# Copyright (c) 2008 Akos Maroy -# -# Copying and distribution of this file, with or without modification, are -# permitted in any medium without royalty provided the copyright notice -# and this notice are preserved. This file is offered as-is, without any -# warranty. - -#serial 6 - -AU_ALIAS([AC_CHECK_ICU], [AX_CHECK_ICU]) -AC_DEFUN([AX_CHECK_ICU], [ - succeeded=no - - if test -z "$ICU_CONFIG"; then - AC_PATH_PROG(ICU_CONFIG, icu-config, no) - fi - - if test "$ICU_CONFIG" = "no" ; then - echo "*** The icu-config script could not be found. Make sure it is" - echo "*** in your path, and that taglib is properly installed." - echo "*** Or see http://ibm.com/software/globalization/icu/" - else - ICU_VERSION=`$ICU_CONFIG --version` - AC_MSG_CHECKING(for ICU >= $1) - VERSION_CHECK=`expr $ICU_VERSION \>\= $1` - if test "$VERSION_CHECK" = "1" ; then - AC_MSG_RESULT(yes) - succeeded=yes - - AC_MSG_CHECKING(ICU_CPPFLAGS) - ICU_CPPFLAGS=`$ICU_CONFIG --cppflags` - AC_MSG_RESULT($ICU_CPPFLAGS) - - AC_MSG_CHECKING(ICU_CFLAGS) - ICU_CFLAGS=`$ICU_CONFIG --cflags` - AC_MSG_RESULT($ICU_CFLAGS) - - AC_MSG_CHECKING(ICU_CXXFLAGS) - ICU_CXXFLAGS=`$ICU_CONFIG --cxxflags` - AC_MSG_RESULT($ICU_CXXFLAGS) - - AC_MSG_CHECKING(ICU_LIBS) - ICU_LIBS=`$ICU_CONFIG --ldflags` - AC_MSG_RESULT($ICU_LIBS) - else - ICU_CPPFLAGS="" - ICU_CFLAGS="" - ICU_CXXFLAGS="" - ICU_LIBS="" - ## If we have a custom action on failure, don't print errors, but - ## do set a variable so people can do so. - ifelse([$3], ,echo "can't find ICU >= $1",) - fi - - AC_SUBST(ICU_CPPFLAGS) - AC_SUBST(ICU_CFLAGS) - AC_SUBST(ICU_CXXFLAGS) - AC_SUBST(ICU_LIBS) - fi - - if test $succeeded = yes; then - ifelse([$2], , :, [$2]) - else - ifelse([$3], , AC_MSG_ERROR([Library requirements (ICU) not met.]), [$3]) - fi -])
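
For reference, the build flow this patch converges on (and which test.sh above exercises) is a standard out-of-source CMake build. A minimal sketch, assuming a Unix Makefiles generator; the -j value and the choice to enable tests are illustrative, while the option names and defaults come from the new CMakeLists.txt (SPM_BUILD_TEST defaults to OFF, SPM_ENABLE_SHARED to ON):

    # Configure and build in a separate build/ directory.
    mkdir -p build && cd build
    cmake .. -DSPM_BUILD_TEST=ON
    make -j4       # builds libsentencepiece, libsentencepiece_train, and the spm_* tools
    make test      # runs sentencepiece_test with --data_dir=${PROJECT_SOURCE_DIR}/data
    make install   # honors CMAKE_INSTALL_PREFIX (default /usr/local)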