-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Support text normalization * Release v1.7.7 * Fix typos * minor fixes * Add ccache to CI
- Loading branch information
1 parent
6c2035f
commit 0872dbc
Showing
16 changed files
with
221 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,6 +42,11 @@ jobs: | |
with: | ||
fetch-depth: 0 | ||
|
||
- name: ccache | ||
uses: hendrikmuhs/ccache-action@v1.2 | ||
with: | ||
key: ${{ matrix.os }}-${{ matrix.python-version }} | ||
|
||
- name: Setup Python ${{ matrix.python-version }} | ||
uses: actions/setup-python@v2 | ||
with: | ||
|
@@ -58,6 +63,10 @@ jobs: | |
- name: Build doc | ||
shell: bash | ||
run: | | ||
export CMAKE_CXX_COMPILER_LAUNCHER=ccache | ||
export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" | ||
cmake --version | ||
cd docs | ||
python3 -m pip install -r ./requirements.txt | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,6 +39,11 @@ jobs: | |
steps: | ||
- uses: actions/checkout@v2 | ||
|
||
- name: ccache | ||
uses: hendrikmuhs/ccache-action@v1.2 | ||
with: | ||
key: ${{ matrix.os }}-${{ matrix.python-version }} | ||
|
||
# see https://github.com/microsoft/setup-msbuild | ||
- name: Add msbuild to PATH | ||
if: startsWith(matrix.os, 'windows') | ||
|
@@ -52,6 +57,10 @@ jobs: | |
- name: Build | ||
shell: bash | ||
run: | | ||
export CMAKE_CXX_COMPILER_LAUNCHER=ccache | ||
export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" | ||
cmake --version | ||
python3 -m pip install -U pip wheel numpy | ||
python3 setup.py bdist_wheel | ||
ls -lh dist | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,6 +41,11 @@ jobs: | |
with: | ||
fetch-depth: 0 | ||
|
||
- name: ccache | ||
uses: hendrikmuhs/ccache-action@v1.2 | ||
with: | ||
key: ${{ matrix.os }}-${{ matrix.python-version }} | ||
|
||
- name: Setup Python ${{ matrix.python-version }} | ||
uses: actions/setup-python@v2 | ||
with: | ||
|
@@ -58,6 +63,10 @@ jobs: | |
- name: Build | ||
shell: bash | ||
run: | | ||
export CMAKE_CXX_COMPILER_LAUNCHER=ccache | ||
export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" | ||
cmake --version | ||
mkdir build | ||
cd build | ||
cmake -DKALDIFST_BUILD_TESTS=ON .. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,6 +32,11 @@ jobs: | |
with: | ||
fetch-depth: 0 | ||
|
||
- name: ccache | ||
uses: hendrikmuhs/ccache-action@v1.2 | ||
with: | ||
key: ${{ matrix.os }}-${{ matrix.python-version }} | ||
|
||
# see https://github.com/microsoft/setup-msbuild | ||
- name: Add msbuild to PATH | ||
if: startsWith(matrix.os, 'windows') | ||
|
@@ -57,6 +62,10 @@ jobs: | |
- name: Install kaldifst | ||
shell: bash | ||
run: | | ||
export CMAKE_CXX_COMPILER_LAUNCHER=ccache | ||
export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" | ||
cmake --version | ||
pip3 install --verbose -U kaldifst | ||
- name: Display kaldifst version | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,6 +36,11 @@ jobs: | |
with: | ||
fetch-depth: 0 | ||
|
||
- name: ccache | ||
uses: hendrikmuhs/ccache-action@v1.2 | ||
with: | ||
key: ${{ matrix.os }}-${{ matrix.python-version }} | ||
|
||
- name: Setup Python ${{ matrix.python-version }} | ||
uses: actions/setup-python@v2 | ||
with: | ||
|
@@ -47,6 +52,10 @@ jobs: | |
- name: Install kaldifst | ||
shell: bash | ||
run: | | ||
export CMAKE_CXX_COMPILER_LAUNCHER=ccache | ||
export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" | ||
cmake --version | ||
pip3 install --verbose kaldifst | ||
- name: Run test | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ set(srcs | |
kaldi-semaphore.cc | ||
kaldi-table.cc | ||
parse-options.cc | ||
text-normalizer.cc | ||
text-utils.cc | ||
) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
// kaldifst/csrc/text-normalizer.cc | ||
// | ||
// Copyright (c) 2023 Xiaomi Corporation | ||
|
||
#include "kaldifst/csrc/text-normalizer.h" | ||
|
||
#include <memory> | ||
#include <string> | ||
#include <utility> | ||
|
||
#include "fst/arcsort.h" | ||
#include "kaldifst/csrc/kaldi-fst-io.h" | ||
#include "kaldifst/csrc/table-matcher.h" | ||
|
||
namespace fst { | ||
|
||
// This variable is copied from | ||
// https://github.com/pzelasko/Pynini/blob/master/src/stringcompile.h#L81 | ||
// | ||
// It is the bitwise OR of the FST property flags listed below. | ||
// kaldifst::StringToFst() in this file passes it as both arguments of | ||
// SetProperties() on the FST it builds from a string. | ||
constexpr uint64_t kCompiledStringProps = | ||
kAcceptor | kIDeterministic | kODeterministic | kILabelSorted | | ||
kOLabelSorted | kUnweighted | kAcyclic | kInitialAcyclic | kTopSorted | | ||
kAccessible | kCoAccessible | kString | kUnweightedCycles; | ||
} // namespace fst | ||
|
||
namespace kaldifst { | ||
|
||
// Builds a linear acceptor with one arc per byte of `text`. | ||
// | ||
// Every arc uses the byte value, read as uint8_t, as both its input and | ||
// its output label and carries Weight::One(). The first state added is the | ||
// start state; the state reached after the last byte is made final. An | ||
// empty `text` therefore yields a single state that is both start and final. | ||
// | ||
// We don't use StringCompiler<StdArc> here since it treats bytes as | ||
// signed integers. | ||
static fst::StdVectorFst StringToFst(const std::string &text) { | ||
using Weight = typename fst::StdArc::Weight; | ||
using Arc = fst::StdArc; | ||
|
||
fst::StdVectorFst ans; | ||
// One state per byte plus the start state, hence size() + 1. | ||
ans.ReserveStates(text.size() + 1); | ||
|
||
auto s = ans.AddState(); | ||
ans.SetStart(s); | ||
// CAUTION(fangjun): We need to use uint8_t here. | ||
for (const uint8_t label : text) { | ||
const auto nextstate = ans.AddState(); | ||
ans.AddArc(s, Arc(label, label, Weight::One(), nextstate)); | ||
s = nextstate; | ||
} | ||
|
||
ans.SetFinal(s, Weight::One()); | ||
// The chain built above is declared to have the compiled-string properties. | ||
ans.SetProperties(fst::kCompiledStringProps, fst::kCompiledStringProps); | ||
|
||
return ans; | ||
} | ||
|
||
// Constructs a normalizer from the rule FST stored in the file `rule`. | ||
// | ||
// The FST returned by fst::ReadFstKaldiGeneric(rule) is passed through | ||
// CastOrConvertToConstFst() and the resulting pointer is owned by rule_. | ||
// NOTE(review): what happens when `rule` cannot be read is not visible | ||
// here; it depends on ReadFstKaldiGeneric -- confirm. | ||
TextNormalizer::TextNormalizer(const std::string &rule) { | ||
rule_ = std::unique_ptr<fst::StdConstFst>( | ||
CastOrConvertToConstFst(fst::ReadFstKaldiGeneric(rule))); | ||
} | ||
|
||
// Constructs a normalizer that takes ownership of an in-memory rule FST. | ||
// NOTE(review): `rule` is not checked for null, while Normalize() | ||
// dereferences rule_ -- callers presumably must pass a non-null pointer. | ||
TextNormalizer::TextNormalizer(std::unique_ptr<fst::StdConstFst> rule) | ||
: rule_(std::move(rule)) {} | ||
|
||
// Normalizes `s` by applying the rule FST to it and returns the result. | ||
// | ||
// The input is converted to an FST with StringToFst(), composed with | ||
// *rule_, and the single shortest path of the composition is printed | ||
// with a BYTE-mode StringPrinter. | ||
// | ||
// NOTE(review): whatever string_printer(...) returns is discarded, so if | ||
// printing fails (e.g. presumably when no path survives the composition) | ||
// the caller cannot tell -- confirm the intended behavior. | ||
std::string TextNormalizer::Normalize(const std::string &s) const { | ||
// Step 1: Convert the input text into an FST | ||
fst::StdVectorFst text = StringToFst(s); | ||
|
||
// Step 2: Compose the input text with the rule FST | ||
fst::StdVectorFst composed_fst; | ||
fst::Compose(text, *rule_, &composed_fst); | ||
|
||
// Step 3: Get the best path from the composed FST | ||
fst::StdVectorFst one_best; | ||
fst::ShortestPath(composed_fst, &one_best, 1); | ||
|
||
// Step 4: Concatenate the output labels of the best path | ||
fst::StringPrinter<fst::StdArc> string_printer(fst::StringTokenType::BYTE); | ||
|
||
std::string normalized; | ||
string_printer(one_best, &normalized); | ||
|
||
return normalized; | ||
} | ||
|
||
} // namespace kaldifst |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
// kaldifst/csrc/text-normalizer.h | ||
// | ||
// Copyright (c) 2023 Xiaomi Corporation | ||
|
||
#ifndef KALDIFST_CSRC_TEXT_NORMALIZER_H_ | ||
#define KALDIFST_CSRC_TEXT_NORMALIZER_H_ | ||
|
||
#include <memory> | ||
#include <string> | ||
|
||
#include "fst/fst.h" | ||
#include "fst/fstlib.h" | ||
|
||
namespace kaldifst { | ||
|
||
// Normalizes text with a rule given as an FST (held as fst::StdConstFst). | ||
class TextNormalizer { | ||
public: | ||
// Path to rule.fst | ||
explicit TextNormalizer(const std::string &rule); | ||
|
||
// Takes ownership of an already constructed rule FST. | ||
explicit TextNormalizer(std::unique_ptr<fst::StdConstFst> rule); | ||
|
||
// Returns the normalized form of `s`; does not modify this object. | ||
std::string Normalize(const std::string &s) const; | ||
|
||
private: | ||
// The rule FST, owned by this object. | ||
std::unique_ptr<fst::StdConstFst> rule_; | ||
}; | ||
|
||
} // namespace kaldifst | ||
|
||
#endif // KALDIFST_CSRC_TEXT_NORMALIZER_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
// kaldifst/python/csrc/text-normalizer.cc | ||
// | ||
// Copyright (c) 2023 Xiaomi Corporation | ||
|
||
#include "kaldifst/csrc/text-normalizer.h" | ||
|
||
#include <string> | ||
|
||
#include "kaldifst/python/csrc/text-normalizer.h" | ||
|
||
namespace kaldifst { | ||
|
||
// Registers the Python class "TextNormalizer" on module `*m`. | ||
// | ||
// Exposed to Python: a constructor taking `rule` (a string), plus | ||
// `normalize` and `__call__`, both bound to TextNormalizer::Normalize. | ||
// The std::unique_ptr constructor of TextNormalizer is not bound here. | ||
void PybindTextNormalizer(py::module *m) { | ||
using PyClass = TextNormalizer; | ||
py::class_<PyClass>(*m, "TextNormalizer") | ||
.def(py::init<const std::string &>(), py::arg("rule")) | ||
.def("normalize", &PyClass::Normalize) | ||
.def("__call__", &PyClass::Normalize); | ||
} | ||
|
||
} // namespace kaldifst |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
// kaldifst/python/csrc/text-normalizer.h | ||
// | ||
// Copyright (c) 2023 Xiaomi Corporation | ||
#ifndef KALDIFST_PYTHON_CSRC_TEXT_NORMALIZER_H_ | ||
#define KALDIFST_PYTHON_CSRC_TEXT_NORMALIZER_H_ | ||
|
||
#include "kaldifst/python/csrc/kaldifst.h" | ||
namespace kaldifst { | ||
|
||
// Adds the Python bindings for kaldifst::TextNormalizer to module `*m`. | ||
void PybindTextNormalizer(py::module *m); | ||
|
||
} | ||
|
||
#endif // KALDIFST_PYTHON_CSRC_TEXT_NORMALIZER_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters