Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Subtitle #7

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions batch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os
from bertalign import Bertalign
from bertalign.eval import *

src_dir = 'text+berg/Chinese'  # directory holding the source-language (Chinese) files
tgt_dir = 'text+berg/English'  # directory holding the translations; each file must share its source file's name
output_dir = 'text+berg/output_alignments'  # directory that receives one alignment file per input pair

test_alignments = []  # alignment results, one entry per processed file pair


def iter_file_pairs(source_dir, target_dir):
    """Yield ``(name, src_file, tgt_file)`` for every alignable file pair.

    Entries of ``source_dir`` are visited in sorted order so repeated runs
    process files in the same sequence (``os.listdir`` gives no ordering
    guarantee).  An entry is skipped silently when either path is a
    directory, and skipped with a printed notice when the target directory
    has no file of the same name, so one missing translation does not abort
    the whole batch.

    Both returned paths use forward slashes, as in the original script.
    """
    for name in sorted(os.listdir(source_dir)):
        src_file = os.path.join(source_dir, name).replace("\\", "/")
        tgt_file = os.path.join(target_dir, name).replace("\\", "/")

        if os.path.isdir(src_file) or os.path.isdir(tgt_file):
            continue
        if not os.path.isfile(tgt_file):
            print("Skipping {}: no matching target file {}".format(src_file, tgt_file))
            continue

        yield name, src_file, tgt_file


def read_text(path):
    """Return the full UTF-8 text of ``path``, closing the file afterwards."""
    with open(path, 'rt', encoding='utf-8') as handle:
        return handle.read()


def main():
    """Align every source/target pair and write one result file per pair.

    Each result is written to ``output_dir`` as ``<input name>.txt`` and the
    aligner's ``result`` is appended to ``test_alignments``.
    """
    # Make sure the output directory exists before the first write.
    os.makedirs(output_dir, exist_ok=True)

    for name, src_file, tgt_file in iter_file_pairs(src_dir, tgt_dir):
        src = read_text(src_file)
        tgt = read_text(tgt_file)

        print("Start aligning {} to {}".format(src_file, tgt_file))
        aligner = Bertalign(src, tgt, is_split=True)
        aligner.align_sents()

        # The output keeps the input file name and appends ".txt".
        output_file = os.path.join(output_dir, f"{name}.txt")
        aligner.write_sents_to_file(output_file)
        test_alignments.append(aligner.result)


if __name__ == "__main__":
    main()
101 changes: 83 additions & 18 deletions bertalign/aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from bertalign.corelib import *
from bertalign.utils import *


class Bertalign:
def __init__(self,
src,
Expand All @@ -15,33 +16,51 @@ def __init__(self,
margin=True,
len_penalty=True,
is_split=False,
):
):

self.max_align = max_align
self.top_k = top_k
self.win = win
self.skip = skip
self.margin = margin
self.len_penalty = len_penalty


src = src.replace("\u3000", " ")
src = src.replace("。」<br/>", "%。")
src = src.replace("!」<br/>", "#。")
src = src.replace("?」<br/>", "&。")
src = src.replace("。」", "%。")
src = src.replace("!」", "#。")
src = src.replace("?」", "&。")
src = src.replace("。<br/>", "$。")
src = src.replace("?<br/>", "*。")
src = src.replace("!<br/>", "_。")
src = src.replace("<br/>", "+。")

tgt = tgt.replace(".<br/>", "#.")
tgt = tgt.replace("?<br/>", "$.")
tgt = tgt.replace("?”<br/>", "%.")
tgt = tgt.replace(".”<br/>", "&.")
tgt = tgt.replace("<br/>", "+.")

src = clean_text(src)
tgt = clean_text(tgt)
src_lang = detect_lang(src)
tgt_lang = detect_lang(tgt)

if is_split:
src_sents = src.splitlines()
tgt_sents = tgt.splitlines()
else:
src_sents = split_sents(src, src_lang)
tgt_sents = split_sents(tgt, tgt_lang)

src_num = len(src_sents)
tgt_num = len(tgt_sents)

src_lang = LANG.ISO[src_lang]
tgt_lang = LANG.ISO[tgt_lang]

print("Source language: {}, Number of sentences: {}".format(src_lang, src_num))
print("Target language: {}, Number of sentences: {}".format(tgt_lang, tgt_num))

Expand All @@ -62,36 +81,82 @@ def __init__(self,
self.char_ratio = char_ratio
self.src_vecs = src_vecs
self.tgt_vecs = tgt_vecs

def align_sents(self):
    """Run the two-pass sentence alignment and store it in ``self.result``.

    The first pass looks up the ``self.top_k`` best target candidates for
    each source sentence and aligns with the alignment types returned by
    ``get_alignment_types(2)``.  The second pass builds its search path from
    that first alignment and ``self.win``, then re-aligns with the types
    returned by ``get_alignment_types(self.max_align)``.  Progress messages
    are printed along the way.
    """
    print("Performing first-step alignment ...")
    # NOTE(review): D and I are presumably the scores and indices of the
    # top-k candidates -- confirm against find_top_k_sents.
    D, I = find_top_k_sents(self.src_vecs[0, :], self.tgt_vecs[0, :], k=self.top_k)
    first_alignment_types = get_alignment_types(2)  # 0-1, 1-0, 1-1
    first_w, first_path = find_first_search_path(self.src_num, self.tgt_num)
    first_pointers = first_pass_align(self.src_num, self.tgt_num, first_w, first_path,
                                      first_alignment_types, D, I)
    first_alignment = first_back_track(self.src_num, self.tgt_num, first_pointers, first_path,
                                       first_alignment_types)

    print("Performing second-step alignment ...")
    second_alignment_types = get_alignment_types(self.max_align)
    second_w, second_path = find_second_search_path(first_alignment, self.win, self.src_num, self.tgt_num)
    second_pointers = second_pass_align(self.src_vecs, self.tgt_vecs, self.src_lens, self.tgt_lens,
                                        second_w, second_path, second_alignment_types,
                                        self.char_ratio, self.skip, margin=self.margin,
                                        len_penalty=self.len_penalty)
    second_alignment = second_back_track(self.src_num, self.tgt_num, second_pointers, second_path,
                                         second_alignment_types)

    print("Finished! Successfully aligning {} {} sentences to {} {} sentences\n".format(self.src_num, self.src_lang,
                                                                                        self.tgt_num,
                                                                                        self.tgt_lang))
    self.result = second_alignment

def _restored_lines(self):
    """Yield ``(src_line, tgt_line)`` for every bead in ``self.result``.

    Each side is built by ``self._get_line`` from the bead's sentence
    indices, then its two-character placeholder markers are replaced by
    punctuation.  A side containing the heading marker ("+。" for the source,
    "+." for the target) is wrapped in ``<h4>...</h4>`` and the marker is
    removed.  Shared by ``print_sents`` and ``write_sents_to_file`` so both
    outputs are restored by the same rules.
    """
    # (marker, replacement) pairs, applied in this order.
    src_markers = (
        ("$。", "。"),
        ("_。", "!"),
        ("*。", "?"),
        ("%。", "。」"),
        ("#。", "!」"),
        ("&。", "?」"),
    )
    tgt_markers = (
        ("#.", "."),
        ("$.", "?"),
        ("%.", "?”"),
        ("&.", ".”"),
    )

    def restore(line, markers, heading_marker):
        for marker, replacement in markers:
            line = line.replace(marker, replacement)
        if heading_marker in line:
            # Wrap first, then drop the marker (it may occur more than once).
            line = f"<h4>{line}</h4>".replace(heading_marker, "")
        return line

    for bead in self.result:
        src_line = restore(self._get_line(bead[0], self.src_sents), src_markers, "+。")
        tgt_line = restore(self._get_line(bead[1], self.tgt_sents), tgt_markers, "+.")
        yield src_line, tgt_line

def print_sents(self):
    """Print each aligned pair as source line, target line, then a blank line."""
    for src_line, tgt_line in self._restored_lines():
        print(src_line + "\n" + tgt_line + "\n")

def write_sents_to_file(self, output_file):
    """Write each aligned pair to ``output_file`` (UTF-8, overwritten).

    Every pair takes two lines: the source line followed by the target line.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        for src_line, tgt_line in self._restored_lines():
            out.write(src_line + "\n" + tgt_line + "\n")

@staticmethod
def _get_line(bead, lines):
    """Join the sentences of ``lines`` that ``bead`` covers into one string.

    Only the first and last entries of ``bead`` are used: every sentence
    from ``lines[bead[0]]`` through ``lines[bead[-1]]`` inclusive is joined
    with single spaces.  An empty bead gives an empty string.
    """
    if len(bead) == 0:
        return ''
    return ' '.join(lines[bead[0]:bead[-1] + 1])
2 changes: 1 addition & 1 deletion bertalign/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def clean_text(text):
line = re.sub('\s+', ' ', line)
clean_text.append(line)
return "\n".join(clean_text)

def detect_lang(text):
translator = Translator(service_urls=[
'translate.google.com.hk',
Expand Down
36 changes: 36 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-


from bertalign import Bertalign
# Demo input: a short Chinese passage, one sentence per line.  Several lines
# end with an inline `<br/>` tag, and the third line uses the `\u3000` escape,
# which Python turns into an ideographic space inside this non-raw string.
# NOTE(review): Bertalign.__init__ appears to rewrite `<br/>` and `\u3000`
# before aligning -- confirm against bertalign/aligner.py.
src = """
去年報載華視連續劇《包青天》受到觀眾熱烈歡迎。
弟子們說:「師父就像是佛光山的包青天,常常及時伸出正義的援手,專門為大家排難解紛。」<br/>
設身處地\u3000謀求大家滿意<br/>
人間佛教<br/>
回憶自我懂事以來,就經常看到母親為鄰里親友排難解紛,記得曾經有人向他說:「何必多管閒事呢?」
母親聽了,正色答道:「排難解紛能促進別人的和諧美滿,是正事,怎麼能說是閒事呢?」
及至行腳台灣,先是落腳在佛寺中,搬柴、運水、拉車、採購……無所不做。<br/>
在耳濡目染下,我也繼承了太虛法師的性格,一直都很喜歡幫助別人化解紛爭,而且並不一定是佛光山的徒眾,我才特意關懷照顧!<br/>
"""

# Demo input: the English counterpart, using the same inline `<br/>` tags.
tgt = """
A TV drama series depicting the life of Pao Ch’ing-t’ien (also known as Pao Cheng) was the most watched television show in Taiwan several years ago.
The disciples said, "Master is just like Bao Qingtian of Foguang Mountain. He always offers a just helping hand in time and solves problems for everyone.”<br/>
Put Ourselves in Other People’s Places and Act on Their Behalf<br/>
Humanistic Buddhism<br/>
My disciples have often said about me, “Master is the Pao Ch’ing-t’ien of Fo Guang Shan because whenever there is a dispute, he promptly lends a hand and settles it justly.”<br/>
I inherited my mother’s character?<br/>
As far as I can remember, she served as mediator for quarreling neighbors and relatives.<br/>
Someone once asked her, “Why must you meddle in others’ affairs?”<br/>
“To settle conflicts,” my mother sternly replied, “is no trifling matter; it is a serious business because it promotes harmony and happiness in people’s lives.”<br/>
When later my wanderings in search of Buddhist teaching took me as far away as Taiwan, I first settled in a monastery, where I carried firewood, hauled water, pulled carts, made purchases, and patrolled the mountainscape night and day.
I have also inherited the character of Ven. Taixu, and I have always liked to help others resolve disputes.<br/>
Nor are my care and concern limited to the disciples and followers of Fo Guang Shan.
"""

# Align the two texts with the default options and print the aligned pairs.
aligner = Bertalign(src, tgt)
aligner.align_sents()
aligner.print_sents()

# Optional: write the result to a file instead of printing it.
#output_file = "alignment_result.txt" # output file name including its extension, usually .txt, e.g. "佛教概论.txt"
#aligner.write_sents_to_file(output_file)
Loading