123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390 |
- #!/usr/bin/perl
- # $Id: analyze.pl,v 1.9 2008/05/14 20:45:37 sherlock Exp $
- # Date : 16th October 2003
- # Author : Gavin Sherlock
- # License information (the MIT license)
- # Copyright (c) 2003 Gavin Sherlock; Stanford University
- # Permission is hereby granted, free of charge, to any person
- # obtaining a copy of this software and associated documentation files
- # (the "Software"), to deal in the Software without restriction,
- # including without limitation the rights to use, copy, modify, merge,
- # publish, distribute, sublicense, and/or sell copies of the Software,
- # and to permit persons to whom the Software is furnished to do so,
- # subject to the following conditions:
- # The above copyright notice and this permission notice shall be
- # included in all copies or substantial portions of the Software.
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- # SOFTWARE.
- use strict;
- use warnings;
- use diagnostics;
- use Data::Dumper;
- use Getopt::Long;
- use IO::File;
- use GO::TermFinder;
- use GO::AnnotationProvider::AnnotationParser;
- use GO::OntologyProvider::OboParser;
- use GO::TermFinderReport::Text;
- use GO::Utils::File qw (GenesFromFile);
- use GO::Utils::General qw (CategorizeGenes);
- $|=1;
- ###################################################################################
- sub Usage{
- ###################################################################################
- my $message = shift;
- if (defined $message){
- print $message, "\n";
- }
- print <<USAGE;
- This program takes a list of files, each of which contain a list of
- genes, with one gene per line. It will findTerms for the lists of
- genes in each of the GO aspects, outputting the results to a file
- named for the original file, but with a .terms extension. It will only
- output terms with a corrected P-value of <= 0.05.
- It will use the first supplied argument as the annotation file, the
- second argument as the expected number of genes within the organism,
- the third argument is the name of the obo file, and all subsequent
- files as ones containing lists of genes.
- Usage:
- analyze.pl <annotation_file> <numGenes> <obofile> <file1> <file2> <file3> ... <fileN>
- e.g.
- analyze.pl -a ../t/gene_association.sgd -n 7200 -o ../t/gene_ontology_edit.obo genes.txt genes2.txt
- USAGE
- exit;
- }
- # we need at least 3 arguments, an annotation file, the number of
- # genes in the genome, and a file of input genes to test
- &Usage if (@ARGV < 3);
- # now get our annotation file and number of genes
- my $annotationFile = '';
- my $totalNum = '';
- my $oboFile = '';
- my $background = '';
- my $aspect = '';
- GetOptions( "annotations=s" => \$annotationFile,
- "obofile=s" => \$oboFile,
- "background=s" => \$background,
- "numGenes=i" => \$totalNum,
- "aspect=s" => \$aspect
- );
- if ($oboFile !~ /\.obo$/){
- # require the obo file to have a .obo extension
- &Usage("Your obo file does not have a .obo extension.");
- }
- if ($annotationFile !~ /\.sgd$/){
- &Usage("Perhaps we are missing an annotation file.");
- }
- my @population = ();
- if ($background) {
- @population = GenesFromFile($background)
- }
- # now set up the objects we need
- my $process = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
- aspect => 'P');
- my $component = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
- aspect => 'C');
- my $function = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
- aspect => 'F');
- my $annotation = GO::AnnotationProvider::AnnotationParser->new(annotationFile=>$annotationFile);
- my @termFinders = ();
- if ($background) {
- if ($aspect =~ /^P$|^$/) {
- push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
- ontologyProvider => $process,
- population => \@population,
- aspect => 'P');
- }
-
- if ($aspect =~ /^C$|^$/) {
- push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
- ontologyProvider => $component,
- population => \@population,
- aspect => 'C');
- }
-
- if ($aspect =~ /^F$|^$/) {
- push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
- ontologyProvider => $function,
- population => \@population,
- aspect => 'F');
- }
- } else {
- if ($aspect =~ /^P$|^$/) {
- push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
- ontologyProvider => $process,
- totalNumGenes => $totalNum,
- aspect => 'P');
- }
-
- if ($aspect =~ /^C$|^$/) {
- push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
- ontologyProvider => $component,
- totalNumGenes => $totalNum,
- aspect => 'C');
- }
-
- if ($aspect =~ /^F$|^$/) {
- push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
- ontologyProvider => $function,
- totalNumGenes => $totalNum,
- aspect => 'F');
- }
- }
- my $report = GO::TermFinderReport::Text->new();
- my $cutoff = 0.1;
- # now go through each file
- foreach my $file (@ARGV){
- print "Analyzing $file\n";
- my @genes = GenesFromFile($file);
- my (@list, @notFound, @ambiguous);
- CategorizeGenes(annotation => $annotation,
- genes => \@genes,
- ambiguous => \@ambiguous,
- unambiguous => \@list,
- notFound => \@notFound);
- my $outfile = $file.".terms";
- my $fh = IO::File->new($outfile, q{>} )|| die "Cannot make $outfile : $!";
- print "Results being put in $outfile\n";
- if (@list){
- print $fh "The following gene(s) will be considered:\n\n";
- foreach my $gene (@list){
- print $fh $gene, "\t", $annotation->standardNameByName($gene), "\n";
- }
- print $fh "\n";
- }else{
- print $fh "None of the gene names were recognized\n";
- print $fh "They were:\n\n";
- print $fh join("\n", @notFound), "\n";
- $fh->close;
- next;
- }
- if (@ambiguous){
- # note, some of these ambiguous names would be perfectly fine
- # if put into GO::TermFinder if they are also standard names.
- # Currently the behavior of analyze.pl differs from the
- # default behavior of GO::TermFinder
- print $fh "The following gene(s) are ambiguously named, and so will not be used:\n";
- print $fh join("\n", @ambiguous), "\n\n";
- }
- if (@notFound){
- print $fh "The following gene(s) were not recognized, and will not be considered:\n\n";
- print $fh join("\n", @notFound), "\n\n";
- }
- foreach my $termFinder (@termFinders){
- # it's possible that the supplied number of genes on the
- # command line was less than indicated by the annotation
- # provider, and thus the TermFinder may have used a larger
- # number than was entered on the command line.
- my $totalNumGenesUsedInBackground = $termFinder->totalNumGenes;
- print $fh "Finding terms for ", $termFinder->aspect, "\n\n";
- my @pvalues = $termFinder->findTerms(genes => \@list, calculateFDR => 1);
- if($#pvalues == 0) {
- print "WARNIING: NO p-value structures returned by findTerms(";
- print join ",", @list;
- print ")\n";
- print $fh "\n\n";
- $fh->close;
- exit();
- }
- my $numHypotheses = $report->print(pvalues => \@pvalues,
- numGenes => scalar(@list),
- totalNum => $totalNumGenesUsedInBackground,
- cutoff => $cutoff,
- fh => $fh);
- my $numProcesses = $#pvalues + 1;
- print "Number of GO processes found: $numProcesses\n";
- print "Number of hypotheses passed cutoff: $numHypotheses\n";
- # if they had no significant P-values
- if ($numHypotheses == 0){
- print $fh "No terms were found for this aspect with a corrected P-value <= $cutoff.\n";
- }
- print $fh "\n\n";
- }
- $fh->close;
-
- }
- =pod
- =head1 NAME
- analyze.pl - batch processor to find terms for lists of genes in various files
- =head1 SYNOPSIS
- This program takes a list of files, each of which contain a list of
- genes, with one gene per line. It will findTerms for the lists of
- genes in each of the GO aspects, outputting the results to a file
- named for the original file, but with a .terms extension. It will
- only output terms with a corrected P-value of <= 0.05.
- It will use the first supplied argument as the annotation file, the
- second argument as the expected number of genes within the organism,
- the third argument is the name of the obo file, and all subsequent
- files as ones containing lists of genes.
- Usage:
- analyze.pl <annotation_file> <numGenes> <obofile> <file1> <file2> <file3> ... <fileN>
- e.g.
- analyze.pl ../t/gene_association.sgd 7200 ../t/gene_ontology_edit.obo genes.txt genes2.txt
- An example output file might look like this:
- The following gene(s) will be considered:
-
- YDL235C YPD1
- YDL224C WHI4
- YDL225W SHS1
- YDL226C GCS1
- YDL227C HO
- YDL228C YDL228C
- YDL229W SSB1
- YDL230W PTP1
- YDL231C BRE4
- YDL232W OST4
- YDL233W YDL233W
- YDL234C GYP7
-
- Finding terms for P
-
-
- Finding terms for C
-
-
- Finding terms for F
-
- -- 1 of 15--
- GOID GO:0005096
- TERM GTPase activator activity
- CORRECTED P-VALUE 0.0113038452336839
- UNCORRECTED P-VALUE 0.00113038452336839
- NUM_ANNOTATIONS 2 of 12 in the list, vs 31 of 7272 in the genome
- The genes annotated to this node are:
- YDL234C, YDL226C
- -- 2 of 15--
- GOID GO:0008047
- TERM enzyme activator activity
- CORRECTED P-VALUE 0.0316194107645226
- UNCORRECTED P-VALUE 0.00316194107645226
- NUM_ANNOTATIONS 2 of 12 in the list, vs 52 of 7272 in the genome
- The genes annotated to this node are:
- YDL234C, YDL226C
- -- 3 of 15--
- GOID GO:0005083
- TERM small GTPase regulatory/interacting protein activity
- CORRECTED P-VALUE 0.0340606972468798
- UNCORRECTED P-VALUE 0.00340606972468798
- NUM_ANNOTATIONS 2 of 12 in the list, vs 54 of 7272 in the genome
- The genes annotated to this node are:
- YDL234C, YDL226C
- -- 4 of 15--
- GOID GO:0030695
- TERM GTPase regulator activity
- CORRECTED P-VALUE 0.0475469908576535
- UNCORRECTED P-VALUE 0.00475469908576535
- NUM_ANNOTATIONS 2 of 12 in the list, vs 64 of 7272 in the genome
- The genes annotated to this node are:
- YDL234C, YDL226C
- =head1 AUTHORS
- Gavin Sherlock, sherlock@genome.stanford.edu
- =cut
|