analyze_v2.pl 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390
  1. #!/usr/bin/perl
  2. # $Id: analyze.pl,v 1.9 2008/05/14 20:45:37 sherlock Exp $
  3. # Date : 16th October 2003
  4. # Author : Gavin Sherlock
  5. # License information (the MIT license)
  6. # Copyright (c) 2003 Gavin Sherlock; Stanford University
  7. # Permission is hereby granted, free of charge, to any person
  8. # obtaining a copy of this software and associated documentation files
  9. # (the "Software"), to deal in the Software without restriction,
  10. # including without limitation the rights to use, copy, modify, merge,
  11. # publish, distribute, sublicense, and/or sell copies of the Software,
  12. # and to permit persons to whom the Software is furnished to do so,
  13. # subject to the following conditions:
  14. # The above copyright notice and this permission notice shall be
  15. # included in all copies or substantial portions of the Software.
  16. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17. # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  18. # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  19. # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  20. # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  21. # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22. # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23. # SOFTWARE.
  24. use strict;
  25. use warnings;
  26. use diagnostics;
  27. use Data::Dumper;
  28. use Getopt::Long;
  29. use IO::File;
  30. use GO::TermFinder;
  31. use GO::AnnotationProvider::AnnotationParser;
  32. use GO::OntologyProvider::OboParser;
  33. use GO::TermFinderReport::Text;
  34. use GO::Utils::File qw (GenesFromFile);
  35. use GO::Utils::General qw (CategorizeGenes);
  36. $|=1;
  37. ###################################################################################
  38. sub Usage{
  39. ###################################################################################
  40. my $message = shift;
  41. if (defined $message){
  42. print $message, "\n";
  43. }
  44. print <<USAGE;
  45. This program takes a list of files, each of which contain a list of
  46. genes, with one gene per line. It will findTerms for the lists of
  47. genes in each of the GO aspects, outputting the results to a file
  48. named for the original file, but with a .terms extension. It will only
  49. output terms with a corrected P-value of <= 0.05.
  50. It will use the first supplied argument as the annotation file, the
  51. second argument as the expected number of genes within the organism,
  52. the third argument is the name of the obo file, and all subsequent
  53. files as ones containing lists of genes.
  54. Usage:
  55. analyze.pl <annotation_file> <numGenes> <obofile> <file1> <file2> <file3> ... <fileN>
  56. e.g.
  57. analyze.pl -a ../t/gene_association.sgd -n 7200 -o ../t/gene_ontology_edit.obo genes.txt genes2.txt
  58. USAGE
  59. exit;
  60. }
  61. # we need at least 3 arguments, an annotation file, the number of
  62. # genes in the genome, and a file of input genes to test
  63. &Usage if (@ARGV < 3);
  64. # now get our annotation file and number of genes
  65. my $annotationFile = '';
  66. my $totalNum = '';
  67. my $oboFile = '';
  68. my $background = '';
  69. my $aspect = '';
  70. GetOptions( "annotations=s" => \$annotationFile,
  71. "obofile=s" => \$oboFile,
  72. "background=s" => \$background,
  73. "numGenes=i" => \$totalNum,
  74. "aspect=s" => \$aspect
  75. );
  76. if ($oboFile !~ /\.obo$/){
  77. # require the obo file to have a .obo extension
  78. &Usage("Your obo file does not have a .obo extension.");
  79. }
  80. if ($annotationFile !~ /\.sgd$/){
  81. &Usage("Perhaps we are missing an annotation file.");
  82. }
  83. my @population = ();
  84. if ($background) {
  85. @population = GenesFromFile($background)
  86. }
  87. # now set up the objects we need
  88. my $process = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
  89. aspect => 'P');
  90. my $component = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
  91. aspect => 'C');
  92. my $function = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
  93. aspect => 'F');
  94. my $annotation = GO::AnnotationProvider::AnnotationParser->new(annotationFile=>$annotationFile);
  95. my @termFinders = ();
  96. if ($background) {
  97. if ($aspect =~ /^P$|^$/) {
  98. push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
  99. ontologyProvider => $process,
  100. population => \@population,
  101. aspect => 'P');
  102. }
  103. if ($aspect =~ /^C$|^$/) {
  104. push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
  105. ontologyProvider => $component,
  106. population => \@population,
  107. aspect => 'C');
  108. }
  109. if ($aspect =~ /^F$|^$/) {
  110. push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
  111. ontologyProvider => $function,
  112. population => \@population,
  113. aspect => 'F');
  114. }
  115. } else {
  116. if ($aspect =~ /^P$|^$/) {
  117. push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
  118. ontologyProvider => $process,
  119. totalNumGenes => $totalNum,
  120. aspect => 'P');
  121. }
  122. if ($aspect =~ /^C$|^$/) {
  123. push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
  124. ontologyProvider => $component,
  125. totalNumGenes => $totalNum,
  126. aspect => 'C');
  127. }
  128. if ($aspect =~ /^F$|^$/) {
  129. push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
  130. ontologyProvider => $function,
  131. totalNumGenes => $totalNum,
  132. aspect => 'F');
  133. }
  134. }
  135. my $report = GO::TermFinderReport::Text->new();
  136. my $cutoff = 0.1;
  137. # now go through each file
  138. foreach my $file (@ARGV){
  139. print "Analyzing $file\n";
  140. my @genes = GenesFromFile($file);
  141. my (@list, @notFound, @ambiguous);
  142. CategorizeGenes(annotation => $annotation,
  143. genes => \@genes,
  144. ambiguous => \@ambiguous,
  145. unambiguous => \@list,
  146. notFound => \@notFound);
  147. my $outfile = $file.".terms";
  148. my $fh = IO::File->new($outfile, q{>} )|| die "Cannot make $outfile : $!";
  149. print "Results being put in $outfile\n";
  150. if (@list){
  151. print $fh "The following gene(s) will be considered:\n\n";
  152. foreach my $gene (@list){
  153. print $fh $gene, "\t", $annotation->standardNameByName($gene), "\n";
  154. }
  155. print $fh "\n";
  156. }else{
  157. print $fh "None of the gene names were recognized\n";
  158. print $fh "They were:\n\n";
  159. print $fh join("\n", @notFound), "\n";
  160. $fh->close;
  161. next;
  162. }
  163. if (@ambiguous){
  164. # note, some of these ambiguous names would be perfectly fine
  165. # if put into GO::TermFinder if they are also standard names.
  166. # Currently the behavior of analyze.pl differs from the
  167. # default behavior of GO::TermFinder
  168. print $fh "The following gene(s) are ambiguously named, and so will not be used:\n";
  169. print $fh join("\n", @ambiguous), "\n\n";
  170. }
  171. if (@notFound){
  172. print $fh "The following gene(s) were not recognized, and will not be considered:\n\n";
  173. print $fh join("\n", @notFound), "\n\n";
  174. }
  175. foreach my $termFinder (@termFinders){
  176. # it's possible that the supplied number of genes on the
  177. # command line was less than indicated by the annotation
  178. # provider, and thus the TermFinder may have used a larger
  179. # number than was entered on the command line.
  180. my $totalNumGenesUsedInBackground = $termFinder->totalNumGenes;
  181. print $fh "Finding terms for ", $termFinder->aspect, "\n\n";
  182. my @pvalues = $termFinder->findTerms(genes => \@list, calculateFDR => 1);
  183. if($#pvalues == 0) {
  184. print "WARNIING: NO p-value structures returned by findTerms(";
  185. print join ",", @list;
  186. print ")\n";
  187. print $fh "\n\n";
  188. $fh->close;
  189. exit();
  190. }
  191. my $numHypotheses = $report->print(pvalues => \@pvalues,
  192. numGenes => scalar(@list),
  193. totalNum => $totalNumGenesUsedInBackground,
  194. cutoff => $cutoff,
  195. fh => $fh);
  196. my $numProcesses = $#pvalues + 1;
  197. print "Number of GO processes found: $numProcesses\n";
  198. print "Number of hypotheses passed cutoff: $numHypotheses\n";
  199. # if they had no significant P-values
  200. if ($numHypotheses == 0){
  201. print $fh "No terms were found for this aspect with a corrected P-value <= $cutoff.\n";
  202. }
  203. print $fh "\n\n";
  204. }
  205. $fh->close;
  206. }
  207. =pod
  208. =head1 NAME
  209. analyze.pl - batch processor to find terms for lists of genes in various files
  210. =head1 SYNOPSIS
  211. This program takes a list of files, each of which contain a list of
  212. genes, with one gene per line. It will findTerms for the lists of
  213. genes in each of the GO aspects, outputting the results to a file
  214. named for the original file, but with a .terms extension. It will
  215. only output terms with a corrected P-value of <= 0.05.
  216. It will use the first supplied argument as the annotation file, the
  217. second argument as the expected number of genes within the organism,
  218. the third argument is the name of the obo file, and all subsequent
  219. files as ones containing lists of genes.
  220. Usage:
  221. analyze.pl <annotation_file> <numGenes> <obofile> <file1> <file2> <file3> ... <fileN>
  222. e.g.
  223. analyze.pl ../t/gene_association.sgd 7200 ../t/gene_ontology_edit.obo genes.txt genes2.txt
  224. An example output file might look like this:
  225. The following gene(s) will be considered:
  226. YDL235C YPD1
  227. YDL224C WHI4
  228. YDL225W SHS1
  229. YDL226C GCS1
  230. YDL227C HO
  231. YDL228C YDL228C
  232. YDL229W SSB1
  233. YDL230W PTP1
  234. YDL231C BRE4
  235. YDL232W OST4
  236. YDL233W YDL233W
  237. YDL234C GYP7
  238. Finding terms for P
  239. Finding terms for C
  240. Finding terms for F
  241. -- 1 of 15--
  242. GOID GO:0005096
  243. TERM GTPase activator activity
  244. CORRECTED P-VALUE 0.0113038452336839
  245. UNCORRECTED P-VALUE 0.00113038452336839
  246. NUM_ANNOTATIONS 2 of 12 in the list, vs 31 of 7272 in the genome
  247. The genes annotated to this node are:
  248. YDL234C, YDL226C
  249. -- 2 of 15--
  250. GOID GO:0008047
  251. TERM enzyme activator activity
  252. CORRECTED P-VALUE 0.0316194107645226
  253. UNCORRECTED P-VALUE 0.00316194107645226
  254. NUM_ANNOTATIONS 2 of 12 in the list, vs 52 of 7272 in the genome
  255. The genes annotated to this node are:
  256. YDL234C, YDL226C
  257. -- 3 of 15--
  258. GOID GO:0005083
  259. TERM small GTPase regulatory/interacting protein activity
  260. CORRECTED P-VALUE 0.0340606972468798
  261. UNCORRECTED P-VALUE 0.00340606972468798
  262. NUM_ANNOTATIONS 2 of 12 in the list, vs 54 of 7272 in the genome
  263. The genes annotated to this node are:
  264. YDL234C, YDL226C
  265. -- 4 of 15--
  266. GOID GO:0030695
  267. TERM GTPase regulator activity
  268. CORRECTED P-VALUE 0.0475469908576535
  269. UNCORRECTED P-VALUE 0.00475469908576535
  270. NUM_ANNOTATIONS 2 of 12 in the list, vs 64 of 7272 in the genome
  271. The genes annotated to this node are:
  272. YDL234C, YDL226C
  273. =head1 AUTHORS
  274. Gavin Sherlock, sherlock@genome.stanford.edu
  275. =cut