-
Notifications
You must be signed in to change notification settings - Fork 0
/
fastaExtractrRNA.pl
executable file
·66 lines (59 loc) · 2.28 KB
/
fastaExtractrRNA.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/perl
# this script takes a fasta file with contigs and the output of one Barrnap run to produce a fasta file with rRNA sequences and a .tab file in the style of prodigal
# 4 inputs: - the fasta file of the contigs
# - the .gff file from Barrnap
# - the name of the .tab output
# - the name of the fasta output
# 2 outputs: - a fasta file with the rRNA gene sequences, naming is the contig name appended with _r and a continuous number per contig
# - a table with contig name rRNA gene name (see above), the sense, length, start position, end position, completeness and kind (16S, 5S etc)
# Anna Heintz-Buschart, November 2014
use strict;
use Bio::DB::Fasta;
my $fastaFile = shift;
my $bacFile = shift;
my $listFile = shift;
my $geneFile = shift;
my %allContigs = ();
my $db = Bio::DB::Fasta->new( $fastaFile );
open (IN, $bacFile);
open(GEN, ">", $geneFile) or die "cannot open $geneFile \n";
open(TAB, ">", $listFile) or die "cannot open $listFile \n";
print TAB "contig\tgene\tsense\tlength\tstart\tend\tcompleteness\tkind\n";
while (my $line = <IN>){
unless ($line =~ /^#/) {
chomp $line;
my($contig, undef, undef, $start, $end, undef, $sense, undef, $attribute) = split "\t", $line;
my $completeness = "";
if ($attribute =~ /partial/){
$completeness = "incomplete"
} else {
$completeness = "complete"
}
if (exists $allContigs{$contig}) {
$allContigs{$contig}++
} else {
$allContigs{$contig} = 1
}
my $gene = join("", $contig, "_r", $allContigs{$contig});
my $length = 1 + $end - $start;
my @attributes = split ";", $attribute,2;
my $kind = $attributes[0];
substr($kind, 0 , 5) = "";
print TAB join("\t",$contig,$gene,$sense,$length,$start,$end,$completeness,$kind), "\n";
my $sequence = "";
if ($sense == "+") {
$sequence = $db->seq($contig, $start, $end );
} else {
$sequence = $db->seq($contig, $end, $start);
}
if (!defined( $sequence )) {
print STDERR "Sequence $contig not found. \n";
next;
} else {
print GEN ">$gene\n", "$sequence\n";
}
}
}
close(IN);
close(GEN);
close(TAB);