bucc.py

#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER  Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# Python tools for BUCC bitext mining

import argparse

###############################################################################
#
# Find te optimal threshold given gold alignments
#
###############################################################################

def BuccOptimize(candidate2score, gold):
    items = sorted(candidate2score.items(), key=lambda x: -x[1])
    ngold = len(gold)
    nextract = ncorrect = 0
    threshold = 0
    best_f1 = 0
    for i in range(len(items)):
        nextract += 1
        if '\t'.join(items[i][0]) in gold:
            ncorrect += 1
        if ncorrect > 0:
            precision = ncorrect / nextract
            recall = ncorrect / ngold
            f1 = 2 * precision * recall / (precision + recall)
            if f1 > best_f1:
                best_f1 = f1
                threshold = (items[i][1] + items[i + 1][1]) / 2
    return threshold


###############################################################################
#
# Main
#
###############################################################################

parser = argparse.ArgumentParser(description='LASER: tools for BUCC bitext mining')
parser.add_argument('--encoding', default='utf-8',
    help='character encoding for input/output')
parser.add_argument('--src-lang', required=True,
    help='the source language id')
parser.add_argument('--trg-lang', required=True,
    help='the target language id')
parser.add_argument('--bucc-texts', required=True,
    help='Base name of the text files (language added)')
parser.add_argument('--bucc-ids', required=True,
    help='Base name of the ID files (language added)')
parser.add_argument('--candidates', required=True,
    help='File name of candidate alignments')
parser.add_argument('--gold', default=None,
    help='File name of gold alignments')
parser.add_argument('--threshold', type=float, default=-1,
    help='Threshold (used with --output)')
parser.add_argument('--output', default=None,
    help='File name of output alignments which are below threshold')
parser.add_argument('--verbose', action='store_true',
    help='Detailed output')
args = parser.parse_args()

print('LASER: tools for BUCC bitext mining')

assert (args.gold or args.threshold > 0) \
       and not (args.gold and args.threshold > 0), \
       'Either "--gold" or "--threshold" must be specified'
if args.verbose:
    print(' - reading sentences and IDs')

src_sent2id, trg_sent2id = {}, {}
for lang, sent2id in (args.src_lang, src_sent2id), (args.trg_lang, trg_sent2id):
    repeated = set()
    with open(args.bucc_texts + '.' + lang, encoding=args.encoding, errors='surrogateescape') as f:
        sentences = [line.strip() for line in f]
    with open(args.bucc_ids + '.' + lang, encoding=args.encoding, errors='surrogateescape') as f:
        ids = [line.strip() for line in f]
    for id, sent in zip(ids, sentences):
        if sent in sent2id:
            repeated.add(sent)
        else:
            sent2id[sent] = id
    for sent in repeated:
        del sent2id[sent]

if args.verbose:
    print(' - reading candidates {}'.format(args.candidates))
candidate2score = {}
# id2txt = {}
with open(args.candidates, encoding=args.encoding, errors='surrogateescape') as f:
    for line in f:
        score, src, trg = line.split('\t')
        score = float(score)
        src = src.strip()
        trg = trg.strip()
        if src in src_sent2id and trg in trg_sent2id:
            src_id = src_sent2id[src]
            trg_id = trg_sent2id[trg]
            score = max(score, candidate2score.get((src_id, trg_id), score))
            candidate2score[(src_id, trg_id)] = score
            # id2txt[src_id + '\t' + trg_id] = src + '\t' + trg

def BuccExtract(cand2score, th, fname):
    if fname:
        of = open(fname, 'w', encoding=args.encoding)
    bitexts = []
    for (src, trg), score in cand2score.items():
        if score >= th:
            bitexts.append(src + '\t' + trg)
            if fname:
                of.write(src + '\t' + trg + '\n')
    if fname:
        of.close()
    return bitexts

if args.gold:
    if args.verbose:
        print(' - optimizing threshold on gold alignments {}'.format(args.gold))
        if args.output:
            print(' - extracted bitext are written into {:s}'.format(args.output))
    gold = {line.strip() for line in open(args.gold)}
    threshold = BuccOptimize(candidate2score, gold)

    bitexts = BuccExtract(candidate2score, threshold, args.output)
    ncorrect = len(gold.intersection(bitexts))
    if ncorrect > 0:
        precision = ncorrect / len(bitexts)
        recall = ncorrect / len(gold)
        f1 = 2*precision*recall / (precision + recall)
    else:
        precision = recall = f1 = 0

    print(' - best threshold={:f}: precision={:.2f}, recall={:.2f}, F1={:.2f}'
          .format(threshold, 100*precision, 100*recall, 100*f1))


if args.threshold > 0:
    if args.verbose:
        print(' - extracting bitexts for threshold {:f} into {:s}'.format(args.threshold, args.output))
    BuccExtract(candidate2score, args.threshold, args.output)