123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354 |
- #!/usr/bin/env perl
- use strict;
- use warnings;
- use diagnostics;
- use File::Map qw(map_file);
# Main script: read the report file named by the first command-line argument,
# extract the per-term fields with regular expressions, and print one
# tab-separated row per GO id to the currently selected output handle.
# NOTE(review): the field labels matched below (GOID, TERM, CORRECTED P-VALUE,
# NUM_ANNOTATIONS, ...) look like GO::TermFinder output -- confirm.
my $infile = shift;
die "Usage: $0 <report-file>\n" unless defined $infile && length $infile;

my $input;
map_file $input, $infile;
{
    # Label for the first output column: the path with any leading directory
    # and its last two dot-suffixes removed.  When the path contains fewer
    # than two dots the substitution does not match and $f keeps the whole
    # path.
    (my $f = $infile) =~ s/(.*\/)?(.*)(\.[^\.]*){2}/$2/;

    # Match directly against the mapped scalar; copying it into $_ first
    # would duplicate the whole file in memory.
    # ORF -> gene name lookup, built from every "Y<word> <word>" line.
    my %orfgene  = ($input =~ /(Y\w+)\s+(\w+)\n/g);

    # Parallel lists: element $i of each list is expected to describe the
    # same GO term, in file order.
    my @ids      = ($input =~ /GOID\s+GO:(\d+)/g);
    my @terms    = ($input =~ /TERM\s+(.*?)\n/g);
    my @pvalues  = ($input =~ /\nCORRECTED P-VALUE\s+(\d.*?)\n/g);
    my @clusterf = ($input =~ /NUM_ANNOTATIONS\s+(\d+ of \d+)/g);
    my @bgfreq   = ($input =~ /, vs (\d+ of \d+) in the genome/g);
    my @orfs     = ($input =~ /The genes annotated to this node are:\n(.*?)\n/g);

    # ORF lists are reported colon-separated rather than ", "-separated.
    s/, /:/g for @orfs;

    # Translate each ORF list into the matching gene-name list.  An ORF with
    # no entry in %orfgene is kept under its ORF name instead of producing an
    # undefined (empty, warning-raising) element.
    my @genes;
    for my $orf (@orfs) {
        my @names = map { defined $orfgene{$_} ? $orfgene{$_} : $_ }
                    split /:/, $orf;
        push @genes, join ':', @names;
    }

    header();
    for my $i (0 .. $#ids) {
        report(
            $f,           $ids[$i],    $terms[$i], $pvalues[$i],
            $clusterf[$i], $bgfreq[$i], $orfs[$i],  $genes[$i],
        );
    }
}
# Print the tab-separated column-title row to the currently selected output
# handle.  Takes no arguments; returns the result of print.
sub header {
    my $title_row =
        "REMc ID\tGO_term ID\tGO-term\tCluster frequency\tBackground frequency\tP-value\tORFs\tGenes\n";
    return print $title_row;
}
# Turn a raw "N of M" frequency string into "N out of M genes, P%".
# A zero denominator yields 0.0% instead of a division-by-zero error; a value
# that does not contain "N of M" is returned unchanged (with a warning) and an
# undefined value is returned as undef.
sub _format_frequency {
    my ($raw) = @_;
    return $raw unless defined $raw;

    # Capture into lexicals in list context: a failed match gives an empty
    # list, so stale $1/$2 from an earlier match can never leak in.
    my ($count, $total) = $raw =~ /(\d+) of (\d+)/;
    if (!defined $total) {
        warn "report: cannot parse frequency '$raw'\n";
        return $raw;
    }

    my $percent = $total ? 100 * $count / $total : 0;
    return sprintf "%d out of %d genes, %.1f%%", $count, $total, $percent;
}

# Print one tab-separated result row to the currently selected output handle.
#
# Arguments, in order:
#   $f      - value for the first column (titled "REMc ID" by header())
#   $id     - GO term id
#   $term   - GO term text
#   $p      - p-value, printed as given
#   $cfreq  - cluster frequency as "N of M"
#   $bgfreq - background frequency as "N of M"
#   $orfs   - ORF list, printed as given
#   $genes  - gene list, printed as given
#
# Both frequencies are reformatted by _format_frequency().  Undefined fields
# are printed as empty strings.  Returns the result of print.
sub report {
    my ($f, $id, $term, $p, $cfreq, $bgfreq, $orfs, $genes) = @_;

    my @columns = (
        $f, $id, $term,
        _format_frequency($cfreq),
        _format_frequency($bgfreq),
        $p, $orfs, $genes,
    );

    print join("\t", map { defined $_ ? $_ : '' } @columns), "\n";
}
|