terms2tsv.pl 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
  1. #!/usr/bin/env perl
  2. use strict;
  3. use warnings;
  4. use diagnostics;
  5. use File::Map qw(map_file);
  # Main script: read the report file named on the command line, pull the
  # per-term fields out of it with regular expressions, and print one
  # tab-separated row per GO id on standard output.
  #
  # Usage: terms2tsv.pl REPORT_FILE
  #
  # NOTE(review): the field markers matched below (GOID, TERM,
  # CORRECTED P-VALUE, NUM_ANNOTATIONS, ...) look like GO term-enrichment
  # report output -- confirm against the tool that produces the input.
  my $infile = shift;
  die "Usage: $0 <terms file>\n" unless defined $infile;
  my $input;
  map_file $input, $infile;
  {
      # Label for the first output column: the input path with an optional
      # leading directory and two trailing dot-extensions stripped. When the
      # pattern does not match, $f is left equal to $infile.
      (my $f = $infile) =~ s/(.*\/)?(.*)(\.[^\.]*){2}/$2/;

      # Every pattern is matched directly against the mapped buffer; copying
      # it into $_ first would duplicate the whole file in memory.

      # Pairs of "Y-prefixed word, whitespace, word, newline" become
      # ORF => gene-name entries.
      my %orfgene = $input =~ /(Y\w+)\s+(\w+)\n/g;

      # One entry per term record for each of the following lists; entries
      # are paired up by position further down.
      my @ids      = $input =~ /GOID\s+GO:(\d+)/g;
      my @terms    = $input =~ /TERM\s+(.*?)\n/g;
      my @pvalues  = $input =~ /\nCORRECTED P-VALUE\s+(\d.*?)\n/g;
      my @clusterf = $input =~ /NUM_ANNOTATIONS\s+(\d+ of \d+)/g;
      my @bgfreq   = $input =~ /, vs (\d+ of \d+) in the genome/g;
      my @orfs     = $input =~ /The genes annotated to this node are:\n(.*?)\n/g;

      # The lists are combined purely by index, so a record with a missing or
      # unparsable field shifts every later row. Report that instead of
      # silently emitting misaligned data.
      my %count_of = (
          'TERM'              => scalar @terms,
          'CORRECTED P-VALUE' => scalar @pvalues,
          'NUM_ANNOTATIONS'   => scalar @clusterf,
          'genome frequency'  => scalar @bgfreq,
          'annotated genes'   => scalar @orfs,
      );
      for my $field (sort keys %count_of) {
          next if $count_of{$field} == @ids;
          warn "$infile: found $count_of{$field} '$field' entries "
              . 'but ' . scalar(@ids) . " GO ids; rows may be misaligned\n";
      }

      # Use ':' as the separator inside the ORF column.
      s/, /:/g for @orfs;

      # Translate each ORF list into the matching gene-name list. An ORF that
      # has no entry in %orfgene yields an empty name, which keeps the
      # positions in both lists aligned and avoids an "uninitialized value"
      # warning from join.
      my @genes;
      for my $orf_list (@orfs) {
          my @names = map { defined $orfgene{$_} ? $orfgene{$_} : '' }
              split /:/, $orf_list;
          push @genes, join(':', @names);
      }

      header();
      for my $i (0 .. $#ids) {
          report(
              $f,            $ids[$i],    $terms[$i], $pvalues[$i],
              $clusterf[$i], $bgfreq[$i], $orfs[$i],  $genes[$i],
          );
      }
  }
  # Print the column-title row of the TSV output on the currently selected
  # output handle. Takes no arguments.
  sub header {
      my @titles = (
          'REMc ID',
          'GO_term ID',
          'GO-term',
          'Cluster frequency',
          'Background frequency',
          'P-value',
          'ORFs',
          'Genes',
      );

      # Build the whole line first so that print receives a single string.
      print join("\t", @titles) . "\n";
  }
  # Turn a "<count> of <total>" string into
  # "<count> out of <total> genes, <pct>%" with one decimal place.
  #
  # A total of 0 is reported as 0.0% instead of dying on a division by zero.
  # A value that does not contain "<digits> of <digits>" is returned
  # unchanged (undef becomes the empty string) after a warning, so the
  # capture variables of some earlier match are never reused by accident.
  sub _format_frequency {
      my ($freq) = @_;

      if (defined $freq && $freq =~ /(\d+) of (\d+)/) {
          my ($count, $total) = ($1, $2);
          my $percent = $total ? 100 * $count / $total : 0;
          return sprintf "%d out of %d genes, %.1f%%", $count, $total, $percent;
      }

      warn 'cannot parse frequency "'
          . (defined $freq ? $freq : 'undef') . "\"\n";
      return defined $freq ? $freq : '';
  }

  # Print one tab-separated result row on the currently selected output
  # handle.
  #
  # Arguments, in order:
  #   $f      - value for the first column
  #   $id     - GO id
  #   $term   - GO term text
  #   $p      - p-value, printed as given
  #   $cfreq  - cluster frequency as "<count> of <total>"
  #   $bgfreq - background frequency as "<count> of <total>"
  #   $orfs   - ORF list, printed as given
  #   $genes  - gene-name list, printed as given
  #
  # Both frequencies are reformatted by _format_frequency before printing;
  # note the output order puts the two frequencies ahead of the p-value.
  sub report {
      my ($f, $id, $term, $p, $cfreq, $bgfreq, $orfs, $genes) = @_;

      $cfreq  = _format_frequency($cfreq);
      $bgfreq = _format_frequency($bgfreq);

      print "$f\t$id\t$term\t$cfreq\t$bgfreq\t$p\t$orfs\t$genes\n";
  }