diff --git a/cube/api.py b/cube/api.py
index 9edfe301..26f02ffe 100644
--- a/cube/api.py
+++ b/cube/api.py
@@ -126,10 +126,7 @@ def __call__(self, text: Union[str, Document], flavour: Optional[str] = None):
         self._lm_helper.apply(doc)
         self._parser.process(doc, self._parser_collate, num_workers=0)
         self._lemmatizer.process(doc, self._lemmatizer_collate, num_workers=0)
-        for seq in doc.sentences:
-            for w in seq.words:
-                if w.upos =='PUNCT':
-                    w.lemma = w.word
+
         return doc
diff --git a/cube/io_utils/config.py b/cube/io_utils/config.py
index 9a3cd0d9..a8c23b82 100644
--- a/cube/io_utils/config.py
+++ b/cube/io_utils/config.py
@@ -87,7 +87,10 @@ def __init__(self, filename=None, verbose=False):
         self.cnn_filter = 512
         self.lang_emb_size = 100
         self.cnn_layers = 5
-        self.external_proj_size = 300
+        self.rnn_size = 50
+        self.rnn_layers = 2
+        self.external_proj_size = 2
+        self.no_space_lang = False
 
         if filename is None:
@@ -139,9 +142,10 @@ def __init__(self, filename=None, verbose=False):
         self.head_size = 100
         self.label_size = 200
         self.lm_model = 'xlm-roberta-base'
-        self.external_proj_size = 300
+        self.external_proj_size = 2
         self.rhl_win_size = 2
-        self.rnn_size = 50
+        self.rnn_size = 200
+        self.rnn_layers = 3
 
         self._valid = True
@@ -275,6 +279,26 @@ def __init__(self, filename=None, verbose=False):
             self.load(filename)
 
 
+class DCWEConfig(Config):
+    def __init__(self, filename=None, verbose=False):
+        super().__init__()
+        self.char_emb_size = 256
+        self.case_emb_size = 32
+        self.num_filters = 512
+        self.kernel_size = 5
+        self.lang_emb_size = 32
+        self.num_layers = 8
+        self.output_size = 300  # this will be automatically updated at training time, so do not change
+
+        if filename is None:
+            if verbose:
+                sys.stdout.write("No configuration file supplied. Using default values.\n")
+        else:
+            if verbose:
+                sys.stdout.write("Reading configuration file " + filename + " \n")
+            self.load(filename)
+
+
 class GDBConfig(Config):
     def __init__(self, filename=None, verbose=False):
         super().__init__()
diff --git a/cube/networks/dcwe.py b/cube/networks/dcwe.py
new file mode 100644
index 00000000..a0549d6e
--- /dev/null
+++ b/cube/networks/dcwe.py
@@ -0,0 +1,83 @@
+import torch
+import torch.nn as nn
+import pytorch_lightning as pl
+from typing import *
+import sys
+
+sys.path.append('')
+from cube.networks.modules import WordGram, LinearNorm
+from cube.io_utils.encodings import Encodings
+from cube.io_utils.config import DCWEConfig
+
+
+class DCWE(pl.LightningModule):
+    encodings: Encodings
+    config: DCWEConfig
+
+    def __init__(self, config: DCWEConfig, encodings: Encodings):
+        super(DCWE, self).__init__()
+        self._config = config
+        self._encodings = encodings
+        self._wg = WordGram(num_chars=len(encodings.char2int),
+                            num_langs=encodings.num_langs,
+                            num_layers=config.num_layers,
+                            num_filters=config.num_filters,
+                            char_emb_size=config.char_emb_size,
+                            case_emb_size=config.case_emb_size,
+                            lang_emb_size=config.lang_emb_size
+                            )
+        self._output_proj = LinearNorm(config.num_filters // 2, config.output_size, w_init_gain='linear')
+        self._improve = 0
+        self._best_loss = 9999
+
+    def forward(self, x_char, x_case, x_lang, x_mask, x_word_len):
+        pre_proj = self._wg(x_char, x_case, x_lang, x_mask, x_word_len)
+        proj = self._output_proj(pre_proj)
+        return proj
+
+    def _get_device(self):
+        if self._output_proj.linear_layer.weight.device.type == 'cpu':
+            return 'cpu'
+        return '{0}:{1}'.format(self._output_proj.linear_layer.weight.device.type,
+                                str(self._output_proj.linear_layer.weight.device.index))
+
+    def configure_optimizers(self):
+        return torch.optim.AdamW(self.parameters())
+
+    def training_step(self, batch, batch_idx):
+        x_char = batch['x_char']
+        x_case = batch['x_case']
+        x_lang = batch['x_lang']
+        x_word_len = batch['x_word_len']
+        x_mask = batch['x_mask']
+        y_target = batch['y_target']
+        y_pred = self.forward(x_char, x_case, x_lang, x_mask, x_word_len)
+        loss = torch.mean((y_pred - y_target) ** 2)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        x_char = batch['x_char']
+        x_case = batch['x_case']
+        x_lang = batch['x_lang']
+        x_word_len = batch['x_word_len']
+        x_mask = batch['x_mask']
+        y_target = batch['y_target']
+        y_pred = self.forward(x_char, x_case, x_lang, x_mask, x_word_len)
+        loss = torch.mean((y_pred - y_target) ** 2)
+        return {'loss': loss.detach().cpu().item()}
+
+    def validation_epoch_end(self, outputs: List[Any]) -> None:
+        mean_loss = sum([output['loss'] for output in outputs])
+        mean_loss /= len(outputs)
+        self.log('val/loss', mean_loss)
+        self.log('val/early_meta', self._improve)
+
+    def save(self, path):
+        torch.save(self.state_dict(), path)
+
+    def load(self, model_path: str, device: str = 'cpu'):
+        self.load_state_dict(torch.load(model_path, map_location='cpu')['state_dict'])
+        self.to(device)
+
+
diff --git a/cube/networks/lm.py b/cube/networks/lm.py
index 8fb1d358..04775565 100644
--- a/cube/networks/lm.py
+++ b/cube/networks/lm.py
@@ -211,6 +211,27 @@ def apply_raw(self, batch):
         pass
 
 
+class LMHelperDummy(LMHelper):
+    def __init__(self, device: str = 'cpu', model: str = None):
+        pass
+
+    def get_embedding_size(self):
+        return [1]
+
+    def apply(self, document: Document):
+        for ii in tqdm.tqdm(range(len(document.sentences)), desc="Pre-computing embeddings", unit="sent"):
+            for jj in range(len(document.sentences[ii].words)):
+                document.sentences[ii].words[jj].emb = [[1.0]]
+
+    def apply_raw(self, batch):
+        embeddings = []
+        for ii in range(len(batch)):
+            c_emb = []
+            for jj in range(len(batch[ii])):
+                c_emb.append([1.0])
+            embeddings.append(c_emb)
+        return embeddings
+
 if __name__ == "__main__":
     from ipdb import set_trace
diff --git a/cube/networks/modules.py b/cube/networks/modules.py
index 09992c3c..6a8509bf 100644
--- a/cube/networks/modules.py
+++ b/cube/networks/modules.py
@@ -427,9 +427,10 @@ def __init__(self, num_chars: int, num_langs: int, num_filters=512, char_emb_siz
         super(WordGram, self).__init__()
         NUM_FILTERS = num_filters
         self._num_filters = NUM_FILTERS
-        self._lang_emb = nn.Embedding(num_langs + 1, lang_emb_size)
-        self._tok_emb = nn.Embedding(num_chars + 1, char_emb_size)
-        self._case_emb = nn.Embedding(4, case_emb_size)
+        self._lang_emb = nn.Embedding(num_langs + 1, lang_emb_size, padding_idx=0)
+        self._tok_emb = nn.Embedding(num_chars + 3, char_emb_size, padding_idx=0)
+        self._case_emb = nn.Embedding(4, case_emb_size, padding_idx=0)
+        self._num_layers = num_layers
 
         convolutions_char = []
         cs_inp = char_emb_size + lang_emb_size + case_emb_size
diff --git a/cube/networks/parser.py b/cube/networks/parser.py
index b5c0ba6c..39f6b033 100644
--- a/cube/networks/parser.py
+++ b/cube/networks/parser.py
@@ -76,7 +76,8 @@ def __init__(self, config: ParserConfig, encodings: Encodings, language_codes: [
         self._upos_emb = nn.Embedding(len(encodings.upos2int), 64)
         self._rnn = nn.LSTM(NUM_FILTERS // 2 + config.lang_emb_size + config.external_proj_size, config.rnn_size,
-                            num_layers=config.rnn_layers, batch_first=True, bidirectional=True, dropout=0.33)
+                            num_layers=config.rnn_layers, batch_first=True, bidirectional=True, dropout=0.1)
+        self._pre_out = LinearNorm(config.rnn_size * 2 + config.lang_emb_size, config.pre_parser_size)
 
         # self._head_r1 = LinearNorm(config.pre_parser_size, config.head_size)
@@ -137,9 +138,10 @@ def forward(self, X):
         for ii in range(len(x_word_emb_packed)):
             we = unpack(x_word_emb_packed[ii], sl, x_sents.shape[1], self._get_device())
             if word_emb_ext is None:
-                word_emb_ext = self._ext_proj[ii](we.float())
+                word_emb_ext = self._ext_proj[ii](we)
             else:
-                word_emb_ext = word_emb_ext + self._ext_proj[ii](we.float())
+                word_emb_ext = word_emb_ext + self._ext_proj[ii](we)
+
         word_emb_ext = word_emb_ext / len(x_word_emb_packed)
         word_emb_ext = torch.tanh(word_emb_ext)
@@ -153,7 +155,8 @@ def forward(self, X):
 
         word_emb = self._word_emb(x_sents)
 
-        x = mask_concat([word_emb, char_emb, word_emb_ext], 0.33, self.training, self._get_device())
+        x = mask_concat([word_emb, char_emb, word_emb_ext], 0.1, self.training, self._get_device())
+
         x = torch.cat([x, lang_emb[:, 1:, :]], dim=-1)
 
         # prepend root
@@ -172,7 +175,8 @@ def forward(self, X):
                 res = tmp
             else:
                 res = res + tmp
-            x = torch.dropout(tmp, 0.2, self.training)
+            x = torch.dropout(tmp, 0.1, self.training)
+
             cnt += 1
             if cnt == self._config.aux_softmax_location:
                 hidden = torch.cat([x + res, lang_emb], dim=1)
@@ -184,7 +188,8 @@ def forward(self, X):
         # aux tagging
         lang_emb = lang_emb.permute(0, 2, 1)
         hidden = hidden.permute(0, 2, 1)[:, 1:, :]
-        pre_morpho = torch.dropout(torch.tanh(self._pre_morpho(hidden)), 0.33, self.training)
+        pre_morpho = torch.dropout(torch.tanh(self._pre_morpho(hidden)), 0.1, self.training)
+
         pre_morpho = torch.cat([pre_morpho, lang_emb[:, 1:, :]], dim=2)
         upos = self._upos(pre_morpho)
         if gs_upos is None:
@@ -200,11 +205,12 @@ def forward(self, X):
         word_emb_ext = torch.cat(
             [torch.zeros((word_emb_ext.shape[0], 1, self._config.external_proj_size), device=self._get_device(),
                          dtype=torch.float), word_emb_ext], dim=1)
-        x = mask_concat([x_parse, word_emb_ext], 0.33, self.training, self._get_device())
+        x = torch.cat([x_parse, word_emb_ext], dim=-1)  # mask_concat([x_parse, word_emb_ext], 0.1, self.training, self._get_device())
         x = torch.cat([x, lang_emb], dim=-1)
         output, _ = self._rnn(x)
         output = torch.cat([output, lang_emb], dim=-1)
-        pre_parsing = torch.dropout(torch.tanh(self._pre_out(output)), 0.33, self.training)
+        pre_parsing = torch.dropout(torch.tanh(self._pre_out(output)), 0.1, self.training)
+
         # h_r1 = torch.tanh(self._head_r1(pre_parsing))
         # h_r2 = torch.tanh(self._head_r2(pre_parsing))
         # l_r1 = torch.tanh(self._label_r1(pre_parsing))
diff --git a/cube/networks/tagger.py b/cube/networks/tagger.py
index 6968d076..37e38dfb 100644
--- a/cube/networks/tagger.py
+++ b/cube/networks/tagger.py
@@ -1,6 +1,9 @@
 import sys
+
 sys.path.append('')
 import os, yaml
+
+
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import pytorch_lightning as pl
 import torch.nn as nn
@@ -14,6 +17,7 @@
 from cube.networks.utils import MorphoCollate, MorphoDataset, unpack, mask_concat
 from cube.networks.modules import WordGram
 
+
 class Tagger(pl.LightningModule):
     def __init__(self, config: TaggerConfig, encodings: Encodings, language_codes: [] = None, ext_word_emb=0):
         super().__init__()
@@ -276,7 +280,8 @@ def validation_epoch_end(self, outputs):
         # print("\n\n\n", upos_ok / total, xpos_ok / total, attrs_ok / total,
         #       aupos_ok / total, axpos_ok / total, aattrs_ok / total, "\n\n\n")
 
-    def load(self, model_path:str, device: str = 'cpu'):
+    def load(self, model_path: str, device: str = 'cpu'):
+
         self.load_state_dict(torch.load(model_path, map_location='cpu')['state_dict'])
         self.to(device)
diff --git a/cube/networks/tokenizer.py b/cube/networks/tokenizer.py
index 7398bc2d..895d217b 100644
--- a/cube/networks/tokenizer.py
+++ b/cube/networks/tokenizer.py
@@ -39,8 +39,8 @@ def __init__(self, config: TokenizerConfig, encodings: Encodings, language_codes
             conv_layer = nn.Sequential(
                 ConvNorm(cs_inp,
                          NUM_FILTERS,
-                         kernel_size=5, stride=1,
-                         padding=2,
+                         kernel_size=3, stride=1,
+                         padding=1,
                          dilation=1, w_init_gain='tanh'),
                 nn.BatchNorm1d(NUM_FILTERS))
             conv_layers.append(conv_layer)
@@ -49,7 +49,13 @@ def __init__(self, config: TokenizerConfig, encodings: Encodings, language_codes
         self._wg = WordGram(len(encodings.char2int), num_langs=encodings.num_langs)
         self._lang_emb = nn.Embedding(encodings.num_langs + 1, config.lang_emb_size, padding_idx=0)
         self._spa_emb = nn.Embedding(3, 16, padding_idx=0)
-        self._output = LinearNorm(NUM_FILTERS // 2 + config.lang_emb_size, 5)
+        self._rnn = nn.LSTM(NUM_FILTERS // 2 + config.lang_emb_size,
+                            config.rnn_size,
+                            num_layers=config.rnn_layers,
+                            bidirectional=True,
+                            batch_first=True)
+        self._output = LinearNorm(config.rnn_size * 2, 5)
+
         ext2int = []
         for input_size in self._ext_word_emb:
@@ -103,6 +109,8 @@ def forward(self, batch):
         half = self._config.cnn_filter // 2
         res = None
         cnt = 0
+
+        skip = None
         for conv in self._convs:
             conv_out = conv(x)
             tmp = torch.tanh(conv_out[:, :half, :]) * torch.sigmoid((conv_out[:, half:, :]))
@@ -110,13 +118,20 @@ def forward(self, batch):
                 res = tmp
             else:
                 res = res + tmp
-            x = torch.dropout(tmp, 0.2, self.training)
+            x = torch.dropout(tmp, 0.1, self.training)
             cnt += 1
             if cnt != self._config.cnn_layers:
+                if skip is not None:
+                    x = x + skip
+                skip = x
+
                 x = torch.cat([x, x_lang], dim=1)
         x = x + res
         x = torch.cat([x, x_lang], dim=1)
         x = x.permute(0, 2, 1)
+
+        x, _ = self._rnn(x)
+
         return self._output(x)
 
     def validation_step(self, batch, batch_idx):
@@ -297,7 +312,9 @@ def process(self, raw_text, collate: TokenCollate, batch_size=32, num_workers: i
         return d
 
     def configure_optimizers(self):
-        return torch.optim.AdamW(self.parameters())
+        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3, weight_decay=1e-4)
+        return optimizer
+
 
     def _compute_early_stop(self, res):
         for lang in res:
diff --git a/cube/networks/utils.py b/cube/networks/utils.py
index 290d7e1b..b8f4718f 100644
--- a/cube/networks/utils.py
+++ b/cube/networks/utils.py
@@ -106,6 +106,8 @@ def __init__(self, document: Document, for_training=True):
                 word = w.word
                 lemma = w.lemma
                 upos = w.upos
+                if len(word) > 25:
+                    continue
 
                 key = (word, lang_id, upos)
                 if key not in lookup or for_training is False:
diff --git a/cube/networks/utils_dcwe.py b/cube/networks/utils_dcwe.py
new file mode 100644
index 00000000..aa00bfe5
--- /dev/null
+++ b/cube/networks/utils_dcwe.py
@@ -0,0 +1,89 @@
+import sys
+import random
+from typing import *
+
+sys.path.append('')
+import numpy as np
+import torch
+from torch.utils.data.dataset import Dataset
+
+from cube.io_utils.objects import Document, Sentence, Token, Word
+from cube.io_utils.encodings import Encodings
+
+from collections import namedtuple
+
+
+class DCWEDataset(Dataset):
+    def __init__(self):
+        self._examples = []
+
+    def __len__(self):
+        return len(self._examples)
+
+    def __getitem__(self, item):
+        return self._examples[item]
+
+    def load_language(self, filename: str, lang: str):
+        f = open(filename, encoding='utf-8')
+        parts = f.readline().strip().split(' ')
+        num_examples = int(parts[0])
+        vector_len = int(parts[1])
+        for ii in range(num_examples):
+            parts = f.readline().strip().split(' ')
+            word = parts[0]
+            vector = [float(pp) for pp in parts[1:]]
+            self._examples.append([lang, word, vector])
+        f.close()
+
+
+class DCWECollate:
+    encodings: Encodings
+    examples: List[Any]
+
+    def __init__(self, encodings: Encodings):
+        self.encodings = encodings
+        self._start = len(encodings.char2int)
+        self._stop = len(encodings.char2int) + 1
+
+    def collate_fn(self, examples):
+        langs = []
+        vectors = []
+        words = []
+        for example in examples:
+            langs.append(example[0])
+            words.append(example[1])
+            vectors.append(example[2])
+
+        max_word_len = max([len(word) for word in words]) + 2
+        x_char = np.zeros((len(examples), max_word_len), dtype=np.long)
+        x_case = np.zeros((len(examples), max_word_len), dtype=np.long)
+        x_word_len = np.zeros((len(examples)), dtype=np.long)
+        x_mask = np.ones((len(examples), 1))
+        x_lang = np.ones((len(examples), 1), dtype=np.long)
+        for ii in range(len(words)):
+            word = words[ii]
+            x_char[ii, 0] = self._start
+            for jj in range(len(word)):
+                char = word[jj]
+                ch_low = char.lower()
+                if ch_low in self.encodings.char2int:
+                    x_char[ii, jj + 1] = self.encodings.char2int[ch_low]
+                else:
+                    x_char[ii, jj + 1] = 1  # UNK
+                if char.lower() == char.upper():
+                    x_case[ii, jj + 1] = 1
+                elif ch_low == char:
+                    x_case[ii, jj + 1] = 2
+                else:
+                    x_case[ii, jj + 1] = 3
+
+            x_char[ii, len(word) + 1] = self._stop
+            x_word_len[ii] = len(word)
+            x_lang[ii, 0] = self.encodings.lang2int[langs[ii]]
+
+        collated = {'y_target': torch.tensor(np.array(vectors)),
+                    'x_char': torch.tensor(x_char),
+                    'x_case': torch.tensor(x_case),
+                    'x_mask': torch.tensor(x_mask),
+                    'x_lang': torch.tensor(x_lang),
+                    'x_word_len': torch.tensor(x_word_len)}
diff --git a/cube/networks/utils_tokenizer.py b/cube/networks/utils_tokenizer.py
index 4879c62b..f2d57a36 100644
--- a/cube/networks/utils_tokenizer.py
+++ b/cube/networks/utils_tokenizer.py
@@ -8,7 +8,7 @@
 from transformers import AutoModel, AutoTokenizer
 from cube.io_utils.encodings import Encodings
 from cube.io_utils.objects import Sentence
-from cube.networks.lm import LMHelperLanguasito, LMHelperFT
+from cube.networks.lm import LMHelperLanguasito, LMHelperFT, LMHelperDummy
 from torch.utils.data.dataset import Dataset
@@ -130,6 +130,9 @@ def __init__(self, encodings: Encodings, lm_model: str = None, lm_device: str =
         elif parts[0] == 'languasito':
             self._lm_helper = LMHelperLanguasito(device=lm_device, model=parts[1])
             self._emb_size = [512]
+        elif parts[0] == 'dummy':
+            self._lm_helper = LMHelperDummy(device=lm_device, model=None)
+            self._emb_size = [1]
         else:
             print("UserWarning: unsupported LM type for tokenizer")
@@ -349,6 +352,10 @@ def __setstate__(self, state):
         elif parts[0] == 'languasito':
             self._lm_helper = LMHelperLanguasito(device=self._lm_device, model=parts[1])
             self._emb_size = [512]
+        elif parts[0] == 'dummy':
+            self._lm_helper = LMHelperDummy(device=self._lm_device, model=None)
+            self._emb_size = [1]
+
 
 class TokenCollateHF(TokenCollate):
diff --git a/cube/trainer.py b/cube/trainer.py
index 6dc0e089..ab2b00f8 100644
--- a/cube/trainer.py
+++ b/cube/trainer.py
@@ -21,7 +21,8 @@
 from cube.networks.utils import MorphoDataset, MorphoCollate, TokenizationDataset, \
     Word2TargetCollate, LemmaDataset, CompoundDataset
 from cube.networks.utils_tokenizer import TokenCollateHF, TokenCollateFTLanguasito
-from cube.networks.lm import LMHelperFT, LMHelperHF, LMHelperLanguasito
+from cube.networks.lm import LMHelperDummy, LMHelperFT, LMHelperHF, LMHelperLanguasito
+
 
 
 class Trainer():
@@ -103,7 +104,8 @@ def fit(self):
         if self.task != "tokenizer" and self.task != 'lemmatizer' and self.task != 'cwe':
             lm_model = config.lm_model
             parts = lm_model.split(':')
-            if parts[0] not in ['transformer', 'fasttext', 'languasito']:
+
+            if parts[0] not in ['transformer', 'fasttext', 'languasito', 'dummy']:
                 print("Error: model prefix should be in the form of transformer: fasttext: or languasito:")
                 sys.exit(0)
             if parts[0] == 'transformer':
@@ -112,6 +114,9 @@ def fit(self):
                 helper = LMHelperFT(device=self.args.lm_device, model=parts[1])
             elif parts[0] == 'languasito':
                 helper = LMHelperLanguasito(device=self.args.lm_device, model=parts[1])
+            elif parts[0] == 'dummy':
+                helper = LMHelperDummy(device=self.args.lm_device)
+
             helper.apply(self.doc_dev)
             helper.apply(self.doc_train)
@@ -197,7 +202,9 @@ def fit(self):
             model = Compound(config=config, encodings=enc, language_codes=self.language_codes)
             # extra check to see if there is actually any compound in this language
             if len(trainset._examples) == 0 or len(devset._examples) == 0:
-                print("\nTrain/dev data for this language does not contain any compound words; there is nothing to train.")
+                print(
+                    "\nTrain/dev data for this language does not contain any compound words; there is nothing to train.")
+
                 return
 
         # dataloaders
@@ -223,11 +230,12 @@ def fit(self):
         trainer = pl.Trainer(
             gpus=args.gpus,
             accelerator=args.accelerator,
-            #num_nodes=1,
+            # num_nodes=1,
            default_root_dir='data/',
             callbacks=callbacks,
             resume_from_checkpoint=resume_from_checkpoint,
             accumulate_grad_batches=args.accumulate_grad_batches,
+            gradient_clip_val=1.0,
             # limit_train_batches=100,
             # limit_val_batches=4,
         )
@@ -259,7 +267,7 @@ def fit(self):
                         help='Where to load LM (default=cuda:0)')
     parser.add_argument('--config', action='store', dest='config_file',
                         help='Load config file')
-    parser = pl.Trainer.add_argparse_args(parser) # add all pytorch lightning params here as well
+    parser = pl.Trainer.add_argparse_args(parser)  # add all pytorch lightning params here as well
 
     args = parser.parse_args()
diff --git a/requirements.txt b/requirements.txt
index 8c14e4a3..ec9252b0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,7 +16,8 @@
 regex
 torch
 tqdm
 configparser
-pytorch_lightning
+pytorch_lightning==1.2.10
+
 transformers==4.2.2
 sentencepiece
diff --git a/setup.py b/setup.py
index c666c2ba..895fa2af 100644
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,7 @@ def parse_requirements(filename, session=None):
 setuptools.setup(
     name="nlpcube",
     version="0.3.1.1",
+    author="Multiple authors",
     author_email="tiberiu44@gmail.com",
     description="Natural Language Processing Toolkit with support for tokenization, sentence splitting, lemmatization, tagging and parsing for more than 60 languages",