-
Notifications
You must be signed in to change notification settings - Fork 2
/
code2seq.js
147 lines (121 loc) · 4.89 KB
/
code2seq.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
/**
* abstract.js
* A tool for building the sequence sets for NMT.
*/
const fs = require("fs");
const fse = require('fs-extra'); const Vocab = require("./utilities/vocab-factory.js");
const Code2Seq = require("./utilities/abstraction-factory.js");
const MutateTry = require("./utilities/mutation-factory.js");
const esseq = require("./utilities/esseq.js");
/* Set up the command line options. */
var argv = require('yargs')
.usage('Usage: node $0 [options]')
.demandOption(['code', 'topn', 'seq', 'vocab'])
.describe('code', 'The path to the input source code snippets')
.describe('topn', 'The path to the top N words (the non-reserved vocab)')
.describe('seq', 'The path the sequences will be output to')
.describe('vocab', 'The path the vocab will be output to')
.describe('buckets', 'The number of project buckets')
.describe('maxlen', 'The maximum length of the sequence')
.default('buckets', 10)
.help('h')
.alias('h', 'help')
.argv;
if(!fs.existsSync(argv.code)) {
console.log('File not found: %s', argv.code);
process.exit(1);
}
if(!fs.existsSync(argv.topn)) {
console.log('File not found: %s', argv.topn);
process.exit(1);
}
/* Set up the input stream. Each line in the input file contains a JSON object
* representing one commit-file. */
var lineReader = require('readline').createInterface({
input: require('fs').createReadStream(argv.code)
});
let ctr = 0;
let topN = fs.readFileSync(argv.topn, 'utf-8').split("\n"); // Top N non-reserved words
let code2seq = new Code2Seq();
let bucketAssignments = new Map(); // Quickly lookup which bucket a project is assigned to
let vocab = new Set();
/* Process a commit-file. */
lineReader.on('line', function (line) {
/* Read the sequence pairs for this commit-file. */
let comfile;
try { comfile = JSON.parse(line); }
catch(e) { console.log("code2seq error while parsing JSON"); return; }
/* Get the current bucket. */
let bucket = bucketAssignments.get(comfile.projectID);
if(bucket === undefined) {
bucket = Math.floor((Math.random() * 10));
bucketAssignments.set(comfile.projectID, bucket);
}
ctr++;
console.log("%d: %s - %s", ctr, comfile.url, comfile.fileName);
/* Process the code into a sequence of words. */
for(let j = 0; j < comfile.sliceChangePair.length; j++) {
let pair = comfile.sliceChangePair[j];
let beforeAST = pair['before-ast'],
beforeSeq = null;
let afterAST = pair['after-ast'],
aterSeq = null;
let file = null;
/* Build the abstracted sequences. */
code2seq.ast2Seq(afterAST, topN);
/* Convert the AST into a word sequence. */
try { afterSeq = esseq.generate(afterAST) }
catch(e) { console.log("code2seq error while generating sequence from AST"); }
if(afterSeq === null) continue;
/* Update the vocab with the words in the sequence. */
for(let i = 0; i < afterSeq.length; i++) {
vocab.add(afterSeq[i]);
}
/* Is this mutable? If it is then we need to search for and remove inserted
* try statements one by one. */
if(pair.labels.includes("MUTATION_CANDIDATE")) {
let mutateTry = new MutateTry(afterAST), seq = null;
try { seq = mutateTry.getNextMutant(); }
catch (e) { console.log("code2seq error while generating mutant"); }
while(seq !== null) {
file = argv.seq + "-mutant" + bucket;
if(!argv.maxlen || seq.length <= argv.maxlen) {
fs.appendFileSync(file + ".meta", comfile.sourceFileChangeID
+ ',' + comfile.projectID + ',' + comfile.commitID
+ ',' + comfile.fileName + ',' + comfile.url + '\n');
fs.appendFileSync(file + ".seq", seq.join(' ') + "\n");
}
try { seq = mutateTry.getNextMutant(); }
catch(e) { console.log("code2seq error while generating mutant"); }
}
}
/* Store the sequence in a file. We have a few rules to consider:
* 1. Projects get evenly distributed across 10 buckets (for 10-fold cross validation).
* 2. Real-world repairs (type=REPAIR) go into a special evaluation set.
* The evaluation set is split into the same 10 buckets as the training
* data. */
if(pair.labels.length === 0)
file = argv.seq + "-nominal" + bucket;
else if(pair.labels.includes("MUTATION_CANDIDATE"))
file = argv.seq + "-candidate" + bucket;
else if(pair.labels.includes("REPAIR"))
file = argv.seq + "-repair" + bucket;
else
file = argv.seq + "-error" + bucket;
if(!argv.maxlen || afterSeq.length <= argv.maxlen) {
fs.appendFileSync(file + ".meta", comfile.sourceFileChangeID
+ ',' + comfile.projectID + ',' + comfile.commitID
+ ',' + comfile.fileName + ',' + comfile.url + '\n');
fs.appendFileSync(file + ".seq", afterSeq.join(' ') + "\n");
}
}
});
/* Create the vocab file, since the input has finished. */
lineReader.on('close', (input) => {
fse.ensureFileSync(argv.vocab);
fs.writeFileSync(argv.vocab, [...vocab].join("\n"));
fse.ensureFileSync('output/buckets');
for(var [key, value] of bucketAssignments.entries()) {
fs.appendFileSync('output/buckets', key + ',' + value + '\n');
}
});