diff --git a/aquilign/align/bertalign/Bertalign.py b/aquilign/align/bertalign/Bertalign.py index 957c9ac..4cdbe11 100644 --- a/aquilign/align/bertalign/Bertalign.py +++ b/aquilign/align/bertalign/Bertalign.py @@ -5,7 +5,7 @@ __author__ = "Jason (bfsujason@163.com)" __version__ = "1.1.0" -from bertalign.encoder import Encoder +from aquilign.align.bertalign.encoder import Encoder # from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline # See other cross-lingual embedding models at # https://www.sbert.net/docs/pretrained_models.html @@ -19,4 +19,4 @@ # nb = input(f'Choose a model:') model = Encoder(models[int(1)]) -from bertalign.aligner import Bertalign +from aquilign.align.bertalign.aligner import Bertalign diff --git a/aquilign/align/bertalign/aligner.py b/aquilign/align/bertalign/aligner.py index f070670..0f629c3 100644 --- a/aquilign/align/bertalign/aligner.py +++ b/aquilign/align/bertalign/aligner.py @@ -1,8 +1,8 @@ import numpy as np -from bertalign.Bertalign import model -import bertalign.corelib as core -import bertalign.utils as utils +from aquilign.align.bertalign.Bertalign import model +import aquilign.align.bertalign.corelib as core +import aquilign.align.bertalign.utils as utils import torch.nn as nn import torch diff --git a/aquilign/align/bertalign/encoder.py b/aquilign/align/bertalign/encoder.py index 4b26c03..40d0fb3 100644 --- a/aquilign/align/bertalign/encoder.py +++ b/aquilign/align/bertalign/encoder.py @@ -1,7 +1,7 @@ import numpy as np import torch from sentence_transformers import SentenceTransformer -from bertalign.utils import yield_overlaps +from aquilign.align.bertalign.utils import yield_overlaps # from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline diff --git a/aquilign/align/bertalign/syntactic_tokenization.py b/aquilign/align/bertalign/syntactic_tokenization.py index 3a5860f..2492c55 100644 --- a/aquilign/align/bertalign/syntactic_tokenization.py +++ b/aquilign/align/bertalign/syntactic_tokenization.py @@ -5,7 +5,7 @@ import json import sys import langid -import bertalign.utils as utils +import aquilign.align.bertalign.utils as utils def syntactic_tokenization(path, corpus_limit=None, use_punctuation=True): name = path.split("/")[-1].split(".")[0] @@ -16,7 +16,7 @@ def syntactic_tokenization(path, corpus_limit=None, use_punctuation=True): codelang, _ = langid.classify(text[:300]) print(text) print(codelang) - with open("bertalign/delimiters.json", "r") as input_json: + with open("aquilign/align/bertalign/delimiters.json", "r") as input_json: dictionary = json.load(input_json) # Il ne reconnaƮt pas toujours le castillan if codelang == "an": diff --git a/main.py b/main.py index 308135c..ffbb0fa 100644 --- a/main.py +++ b/main.py @@ -6,10 +6,10 @@ import sys import numpy as np # import collatex -import graph_merge -import bertalign.utils as utils -import bertalign.syntactic_tokenization as syntactic_tokenization -from bertalign.Bertalign import Bertalign +import aquilign.align.graph_merge as graph_merge +import aquilign.align.bertalign.utils as utils +import aquilign.align.bertalign.syntactic_tokenization as syntactic_tokenization +from aquilign.align.bertalign.Bertalign import Bertalign import pandas as pd import argparse import glob @@ -81,6 +81,10 @@ def __init__(self, corpus_size:None, self.use_punctiation = use_punctuation self.prefix = prefix + try: + os.mkdir(f"result_dir") + except FileExistsError: + pass try: os.mkdir(f"result_dir/{self.out_dir}/") except FileExistsError: @@ -219,7 +223,7 @@ def run_alignments(): prefix = args.prefix use_punctuation = args.use_punctuation print(f"Punctuation for tokenization: {use_punctuation}") - MyAligner = Aligner(corpus_size=None, max_align=3, out_dir=out_dir, use_punctuation=use_punctuation, input_dir=input_dir, main_wit=main_wit, prefix=prefix) + MyAligner = Aligner(corpus_size=100, max_align=3, out_dir=out_dir, use_punctuation=use_punctuation, input_dir=input_dir, main_wit=main_wit, prefix=prefix) MyAligner.parallel_align() utils.write_json(f"result_dir/{out_dir}/alignment_dict.json", MyAligner.alignment_dict) align_dict = utils.read_json(f"result_dir/{out_dir}/alignment_dict.json")