#!/usr/bin/env perl
#
# Convert a GO-term enrichment text report into a tab-separated table.
#
# Usage: <this script> <report-file>
#
# The report is scanned for "GOID GO:<digits>", "TERM <text>",
# "CORRECTED P-VALUE <number>", "NUM_ANNOTATIONS <n> of <m>",
# ", vs <n> of <m> in the genome" and "The genes annotated to this node are:"
# records, plus "<ORF> <gene>" pairs whose ORF name starts with "Y".
# One table row is printed to STDOUT per GOID found, preceded by a header row.
# (The record layout presumably is GO::TermFinder output -- confirm against
# the tool that produces the input.)
use strict;
use warnings;
use diagnostics;
use File::Map qw(map_file);

# Run the conversion only when executed as a script, so that the subs below
# can be loaded (e.g. by a test) without reading a file or printing anything.
main(@ARGV) unless caller;

# main($infile)
#   Entry point: maps $infile into memory, extracts all records and prints
#   the header row followed by one row per record.
#   Dies with a usage message when no input file is given.
sub main {
    my ($infile) = @_;
    die "Usage: $0 <report-file>\n"
        unless defined $infile && length $infile;

    my $input;
    map_file $input, $infile;

    my $cluster_id = _cluster_id($infile);

    header();
    for my $record (_parse_records(\$input)) {
        report($cluster_id, @{$record});
    }
    return;
}

# _cluster_id($path)
#   Derives the value of the first output column from the input path by
#   dropping the leading directory part and the last two dot-separated
#   suffixes ("dir/name.a.b" -> "name"). A path with fewer than two suffixes
#   does not match the pattern and is returned unchanged.
sub _cluster_id {
    my ($path) = @_;
    (my $id = $path) =~ s/(.*\/)?(.*)(\.[^\.]*){2}/$2/;
    return $id;
}

# _parse_records(\$text)
#   Extracts the per-term fields from the report text. Takes a reference so
#   the (memory-mapped) text is matched in place rather than copied.
#   Returns a list of array refs, one per GOID, each holding
#   [ id, term, p-value, cluster freq, background freq, ORFs, genes ]
#   in the argument order report() expects after the cluster id.
#   Fields are paired up purely by their order of appearance in the text.
sub _parse_records {
    my ($text_ref) = @_;
    my (%gene_for, @ids, @terms, @pvalues, @clusterf, @bgfreq, @orfs);

    # "for" aliases $_ to the text; nothing here modifies it.
    for (${$text_ref}) {
        %gene_for = (/(Y\w+)\s+(\w+)\n/g);
        @ids      = (/GOID\s+GO:(\d+)/g);
        @terms    = (/TERM\s+(.*?)\n/g);
        @pvalues  = (/\nCORRECTED P-VALUE\s+(\d.*?)\n/g);
        @clusterf = (/NUM_ANNOTATIONS\s+(\d+ of \d+)/g);
        @bgfreq   = (/, vs (\d+ of \d+) in the genome/g);
        @orfs     = (/The genes annotated to this node are:\n(.*?)\n/g);
    }

    # ORF lists arrive comma-separated; the table uses ':' as separator.
    s/, /:/g for @orfs;

    my @records;
    for my $i (0 .. $#ids) {
        my $orf_list = defined $orfs[$i] ? $orfs[$i] : '';

        # An ORF without a known gene name yields an empty entry instead of
        # an "uninitialized value" warning.
        my $gene_list = join ':',
            map { defined $gene_for{$_} ? $gene_for{$_} : '' }
            split /:/, $orf_list;

        push @records, [
            $ids[$i],      $terms[$i],  $pvalues[$i],
            $clusterf[$i], $bgfreq[$i], $orfs[$i],
            $gene_list,
        ];
    }
    return @records;
}

# _format_frequency($freq)
#   Turns "<n> of <m>" into "<n> out of <m> genes, <pct>%" with one decimal.
#   An undefined value becomes the empty string and a value that does not
#   contain "<n> of <m>" is returned unchanged, so stale regex captures are
#   never reused. A zero denominator is reported as 0.0% instead of dying
#   with a division-by-zero error.
sub _format_frequency {
    my ($freq) = @_;
    return '' unless defined $freq;

    my ($count, $total) = $freq =~ /(\d+) of (\d+)/
        or return $freq;

    my $percent = $total > 0 ? 100 * $count / $total : 0;
    return sprintf '%d out of %d genes, %.1f%%', $count, $total, $percent;
}

# header()
#   Prints the tab-separated column header row to STDOUT.
sub header {
    print join("\t",
        'REMc ID',           'GO_term ID',
        'GO-term',           'Cluster frequency',
        'Background frequency', 'P-value',
        'ORFs',              'Genes'), "\n";
    return;
}

# report($f, $id, $term, $p, $cfreq, $bgfreq, $orfs, $genes)
#   Prints one tab-separated table row to STDOUT. $cfreq and $bgfreq are
#   expected as "<n> of <m>" and are expanded with a percentage; all other
#   values are printed as given. Undefined values are printed as empty
#   columns.
sub report {
    my ($f, $id, $term, $p, $cfreq, $bgfreq, $orfs, $genes) = @_;

    my @columns = (
        $f, $id, $term,
        _format_frequency($cfreq),
        _format_frequency($bgfreq),
        $p, $orfs, $genes,
    );
    print join("\t", map { defined $_ ? $_ : '' } @columns), "\n";
    return;
}