Squashed initial commit

2024-09-10 13:47:29 -04:00
commit 8ebb6ad265
6221 changed files with 2512206 additions and 0 deletions
--- a/qhtcp-workflow/apps/perl/ORF_List_Without_DAmPs.txt
+++ b/qhtcp-workflow/apps/perl/ORF_List_Without_DAmPs.txt
--- a/qhtcp-workflow/apps/perl/analyze_v2.pl
+++ b/qhtcp-workflow/apps/perl/analyze_v2.pl
@@ -0,0 +1,390 @@
+#!/usr/bin/perl
+
+# $Id: analyze.pl,v 1.9 2008/05/14 20:45:37 sherlock Exp $
+
+# Date   : 16th October 2003
+# Author : Gavin Sherlock
+
+# License information (the MIT license)
+
+# Copyright (c) 2003 Gavin Sherlock; Stanford University
+
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+use strict;
+use warnings;
+use diagnostics;
+
+use Data::Dumper;
+
+use Getopt::Long;
+
+use IO::File;
+
+use GO::TermFinder;
+use GO::AnnotationProvider::AnnotationParser;
+use GO::OntologyProvider::OboParser;
+
+use GO::TermFinderReport::Text;
+
+use GO::Utils::File    qw (GenesFromFile);
+use GO::Utils::General qw (CategorizeGenes);
+
+$|=1;
+
+###################################################################################
+sub Usage{
+###################################################################################
+
+    my $message = shift;
+
+    if (defined $message){
+
+	print $message, "\n";
+
+    }
+
+    print <<USAGE;
+
+This program takes a list of files, each of which contain a list of
+genes, with one gene per line.  It will findTerms for the lists of
+genes in each of the GO aspects, outputting the results to a file
+named for the original file, but with a .terms extension.  It will only
+output terms with a corrected P-value of <= 0.05.
+
+It will use the first supplied argument as the annotation file, the
+second argument as the expected number of genes within the organism,
+the third argument is the name of the obo file, and all subsequent
+files as ones containing lists of genes.
+
+Usage:
+
+analyze.pl <annotation_file> <numGenes> <obofile> <file1> <file2> <file3> ... <fileN>
+
+e.g.
+
+analyze.pl -a ../t/gene_association.sgd -n 7200 -o ../t/gene_ontology_edit.obo genes.txt genes2.txt
+
+USAGE
+
+    exit;
+
+}
+
+# we need at least 3 arguments, an annotation file, the number of
+# genes in the genome, and a file of input genes to test
+
+&Usage if (@ARGV < 3);
+
+# now get our annotation file and number of genes
+
+my $annotationFile = '';
+my $totalNum       = '';
+my $oboFile        = '';
+my $background	   = '';
+my $aspect	   = '';
+
+GetOptions(	"annotations=s"	=> \$annotationFile,
+		"obofile=s"	=> \$oboFile,
+		"background=s"	=> \$background,
+		"numGenes=i"	=> \$totalNum,
+		"aspect=s"	=> \$aspect
+	);
+
+if ($oboFile !~ /\.obo$/){
+
+    # require the obo file to have a .obo extension
+
+    &Usage("Your obo file does not have a .obo extension.");
+
+}
+
+if ($annotationFile !~ /\.sgd$/){
+	&Usage("Perhaps we are missing an annotation file.");
+}
+
+my @population = ();
+if ($background) {
+	@population = GenesFromFile($background)
+}
+
+# now set up the objects we need
+
+my $process   = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
+						     aspect       => 'P');
+my $component = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
+						     aspect       => 'C');
+my $function  = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
+						     aspect       => 'F');
+
+my $annotation = GO::AnnotationProvider::AnnotationParser->new(annotationFile=>$annotationFile);
+
+my @termFinders = ();
+
+if ($background) {
+	if ($aspect =~ /^P$|^$/) {
+		push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
+		   				   ontologyProvider  => $process,
+						   population	     => \@population,
+						   aspect            => 'P');
+	}
+	
+	if ($aspect =~ /^C$|^$/) {
+		push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
+						   ontologyProvider  => $component,
+						   population	     => \@population,
+						   aspect            => 'C');
+	}
+	
+	if ($aspect =~ /^F$|^$/) {
+		push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
+						   ontologyProvider  => $function,
+						   population	     => \@population,
+						   aspect            => 'F');
+	}
+} else {
+	if ($aspect =~ /^P$|^$/) {
+		push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
+		   				   ontologyProvider  => $process,
+						   totalNumGenes     => $totalNum,
+						   aspect            => 'P');
+	}
+	
+	if ($aspect =~ /^C$|^$/) {
+		push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
+						   ontologyProvider  => $component,
+						   totalNumGenes     => $totalNum,
+						   aspect            => 'C');
+	}
+	
+	if ($aspect =~ /^F$|^$/) {
+		push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
+						   ontologyProvider  => $function,
+						   totalNumGenes     => $totalNum,
+						   aspect            => 'F');
+	}
+}
+
+
+my $report = GO::TermFinderReport::Text->new();
+
+my $cutoff = 0.1;
+
+# now go through each file
+
+foreach my $file (@ARGV){
+
+    print "Analyzing $file\n";
+
+    my @genes = GenesFromFile($file); 
+
+    my (@list, @notFound, @ambiguous);
+
+    CategorizeGenes(annotation  => $annotation,
+		    genes       => \@genes,
+		    ambiguous   => \@ambiguous,
+		    unambiguous => \@list,
+		    notFound    => \@notFound);
+
+    my $outfile = $file.".terms";
+
+    my $fh = IO::File->new($outfile, q{>} )|| die "Cannot make $outfile : $!";
+
+    print "Results being put in $outfile\n";
+
+    if (@list){
+
+	print $fh "The following gene(s) will be considered:\n\n";
+
+	foreach my $gene (@list){
+
+	    print $fh $gene, "\t", $annotation->standardNameByName($gene), "\n";
+
+	}
+
+	print $fh "\n";
+
+    }else{
+
+	print $fh "None of the gene names were recognized\n";
+	print $fh "They were:\n\n";
+	print $fh join("\n", @notFound), "\n";
+	$fh->close;
+
+	next;
+
+    }
+
+    if (@ambiguous){
+
+	# note, some of these ambiguous names would be perfectly fine
+	# if put into GO::TermFinder if they are also standard names.
+	# Currently the behavior of analyze.pl differs from the
+	# default behavior of GO::TermFinder
+
+	print $fh "The following gene(s) are ambiguously named, and so will not be used:\n";
+	print $fh join("\n", @ambiguous), "\n\n";
+
+    }
+
+    if (@notFound){
+
+	print $fh "The following gene(s) were not recognized, and will not be considered:\n\n";
+	print $fh join("\n", @notFound), "\n\n";
+
+    }
+
+    foreach my $termFinder (@termFinders){
+
+	# it's possible that the supplied number of genes on the
+	# command line was less than indicated by the annotation
+	# provider, and thus the TermFinder may have used a larger
+	# number than was entered on the command line.
+
+	my $totalNumGenesUsedInBackground = $termFinder->totalNumGenes;
+
+	print $fh "Finding terms for ", $termFinder->aspect, "\n\n";
+
+	my @pvalues = $termFinder->findTerms(genes => \@list, calculateFDR => 1);
+  if($#pvalues == 0) {
+    print "WARNIING: NO p-value structures returned by findTerms(";
+    print join ",", @list;
+    print ")\n";
+  	print $fh "\n\n";
+    $fh->close;
+    exit();
+  }
+	my $numHypotheses = $report->print(pvalues  => \@pvalues,
+					   numGenes => scalar(@list),
+					   totalNum => $totalNumGenesUsedInBackground,
+					   cutoff   => $cutoff,
+					   fh       => $fh);
+
+  my $numProcesses = $#pvalues + 1;
+  print "Number of GO processes found: $numProcesses\n";
+  print "Number of hypotheses passed cutoff: $numHypotheses\n";
+
+	# if they had no significant P-values
+
+	if ($numHypotheses == 0){
+
+	    print $fh "No terms were found for this aspect with a corrected P-value <= $cutoff.\n";
+
+	}
+
+	print $fh "\n\n";
+
+    }
+
+    $fh->close;    
+    
+}
+
+=pod
+
+=head1 NAME
+
+analyze.pl - batch processor to find terms for lists of genes in various files
+
+=head1 SYNOPSIS
+
+This program takes a list of files, each of which contain a list of
+genes, with one gene per line.  It will findTerms for the lists of
+genes in each of the GO aspects, outputting the results to a file
+named for the original file, but with a .terms extension.  It will
+only output terms with a corrected P-value of <= 0.05.
+
+It will use the first supplied argument as the annotation file, the
+second argument as the expected number of genes within the organism,
+the third argument is the name of the obo file, and all subsequent
+files as ones containing lists of genes.
+
+Usage:
+
+    analyze.pl <annotation_file> <numGenes> <obofile> <file1> <file2> <file3> ... <fileN>
+
+e.g.
+
+    analyze.pl ../t/gene_association.sgd 7200 ../t/gene_ontology_edit.obo genes.txt genes2.txt
+
+An example output file might look like this:
+
+    The following gene(s) will be considered:
+    
+    YDL235C YPD1
+    YDL224C WHI4
+    YDL225W SHS1
+    YDL226C GCS1
+    YDL227C HO
+    YDL228C YDL228C
+    YDL229W SSB1
+    YDL230W PTP1
+    YDL231C BRE4
+    YDL232W OST4
+    YDL233W YDL233W
+    YDL234C GYP7
+    
+    Finding terms for P
+    
+    
+    Finding terms for C
+    
+    
+    Finding terms for F
+    
+    -- 1 of 15--
+    GOID    GO:0005096
+    TERM    GTPase activator activity
+    CORRECTED P-VALUE       0.0113038452336839
+    UNCORRECTED P-VALUE     0.00113038452336839
+    NUM_ANNOTATIONS 2 of 12 in the list, vs 31 of 7272 in the genome
+    The genes annotated to this node are:
+    YDL234C, YDL226C
+    -- 2 of 15--
+    GOID    GO:0008047
+    TERM    enzyme activator activity
+    CORRECTED P-VALUE       0.0316194107645226
+    UNCORRECTED P-VALUE     0.00316194107645226
+    NUM_ANNOTATIONS 2 of 12 in the list, vs 52 of 7272 in the genome
+    The genes annotated to this node are:
+    YDL234C, YDL226C
+    -- 3 of 15--
+    GOID    GO:0005083
+    TERM    small GTPase regulatory/interacting protein activity
+    CORRECTED P-VALUE       0.0340606972468798
+    UNCORRECTED P-VALUE     0.00340606972468798
+    NUM_ANNOTATIONS 2 of 12 in the list, vs 54 of 7272 in the genome
+    The genes annotated to this node are:
+    YDL234C, YDL226C
+    -- 4 of 15--
+    GOID    GO:0030695
+    TERM    GTPase regulator activity
+    CORRECTED P-VALUE       0.0475469908576535
+    UNCORRECTED P-VALUE     0.00475469908576535
+    NUM_ANNOTATIONS 2 of 12 in the list, vs 64 of 7272 in the genome
+    The genes annotated to this node are:
+    YDL234C, YDL226C
+
+=head1 AUTHORS
+
+Gavin Sherlock, sherlock@genome.stanford.edu
+
+=cut
--- a/qhtcp-workflow/apps/perl/gene_association.README
+++ b/qhtcp-workflow/apps/perl/gene_association.README
@@ -0,0 +1,48 @@
+gene_association.sgd.gz    This file is TAB delimited and contains all GO annotations for yeast genes (protein and RNA) 
+
+The gene_association.sgd.gz file uses the standard file format for
+gene_association files of the Gene Ontology (GO) Consortium.  A more
+complete description of the file format is found here:
+
+http://www.geneontology.org/GO.format.annotation.shtml
+
+Columns are:					Contents:
+
+ 1) DB						- database contributing the file (always "SGD" for this file)
+ 2) DB_Object_ID				- SGDID
+ 3) DB_Object_Symbol				- see below
+ 4) NOT 			(optional)	- 'NOT', 'contributes_to', or 'colocalizes_with' qualifier for a GO annotation, when needed
+ 5) GO ID					- unique numeric identifier for the GO term
+ 6) DB:Reference(|DB:Reference)			- the reference associated with the GO annotation
+ 7) Evidence					- the evidence code for the GO annotation
+ 8) With (or) From 		(optional)	- any With or From qualifier for the GO annotation
+ 9) Aspect					- which ontology the GO term belongs in
+10) DB_Object_Name(|Name) 	(optional)	- a name for the gene product in words, e.g. 'acid phosphatase'
+11) DB_Object_Synonym(|Synonym) (optional)	- see below
+12) DB_Object_Type				- type of object annotated, e.g. gene, protein, etc.
+13) taxon(|taxon)				- taxonomic identifier of species encoding gene product
+14) Date					- date GO annotation was made
+15) Assigned_by					- source of the annotation (e.g. SGD, UniProtKB, YeastFunc, bioPIXIE_MEFIT)
+
+Note on SGD nomenclature (pertaining to columns 3 and 11):
+
+Column 3 - When a Standard Gene Name (e.g. CDC28, COX2) has been
+        conferred, it will be present in Column 3. When no Gene Name
+        has been conferred, the Systematic Name (e.g. YAL001C,
+        YGR116W, YAL034W-A) will be present in column 3.
+
+Column 11 - The Systematic Name (e.g. YAL001C, YGR116W, YAL034W-A,
+	Q0010) will be the first name present in Column 11. Any other
+	names (except the Standard Name, which will be in Column 3 if
+	one exists), including Aliases used for the gene will also be
+	present in this column.
+
+Please note that ORFs classified as 'Dubious' are not included in this file, as there is currently 
+no experimental evidence that a gene product is produced in S. cerevisiae.
+
+This file is updated weekly. 
+
+For more information on the Gene Ontology (GO) project, see: 
+
+  http://www.geneontology.org/
+
--- a/qhtcp-workflow/apps/perl/gene_association.sgd
+++ b/qhtcp-workflow/apps/perl/gene_association.sgd
--- a/qhtcp-workflow/apps/perl/gene_ontology_edit.obo
+++ b/qhtcp-workflow/apps/perl/gene_ontology_edit.obo
--- a/qhtcp-workflow/apps/perl/terms2tsv.pl
+++ b/qhtcp-workflow/apps/perl/terms2tsv.pl
@@ -0,0 +1,54 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use diagnostics;
+use File::Map qw(map_file);
+
+my $infile = shift;
+
+my $input;
+
+map_file $input, $infile;
+
+{
+	local $_	= $input;
+	(my $f		= $infile) =~ s/(.*\/)?(.*)(\.[^\.]*){2}/$2/;
+	my %orfgene	= (/(Y\w+)\s+(\w+)\n/g);
+	my @indices	= (/\Q-- \E(\d+) of \d+\Q --\E/g);
+	my @ids		= (/GOID\s+GO:(\d+)/g);
+	my @terms	= (/TERM\s+(.*?)\n/g);
+	my @pvalues	= (/\nCORRECTED P-VALUE\s+(\d.*?)\n/g);
+	my @clusterf	= (/NUM_ANNOTATIONS\s+(\d+ of \d+)/g);
+	my @bgfreq	= (/, vs (\d+ of \d+) in the genome/g);
+	my @orfs	= (/The genes annotated to this node are:\n(.*?)\n/g);
+	
+	s/, /:/g for @orfs;
+
+	my @genes;
+	for my $orf (@orfs) {
+		my @otmp = split /:/, $orf;
+		my @gtmp = map { $orfgene{$_} } @otmp;
+		push @genes, (join ':', @gtmp);
+	}
+
+	&header();
+	for my $i (0 .. (@ids - 1)) {
+		&report($f, $ids[$i], $terms[$i], $pvalues[$i], $clusterf[$i], $bgfreq[$i], $orfs[$i], $genes[$i]);
+	}
+}
+
+sub header {
+	print "REMc ID\tGO_term ID\tGO-term\tCluster frequency\tBackground frequency\tP-value\tORFs\tGenes\n";
+}
+
+sub report {
+	my ($f, $id, $term, $p, $cfreq, $bgfreq, $orfs, $genes) = @_;
+
+	$cfreq =~ /(\d+) of (\d+)/;
+	$cfreq = sprintf "%d out of %d genes, %.1f%%", $1, $2, (100*$1/$2);
+
+	$bgfreq =~ /(\d+) of (\d+)/;
+	$bgfreq = sprintf "%d out of %d genes, %.1f%%", $1, $2, (100*$1/$2);
+
+	print "$f\t$id\t$term\t$cfreq\t$bgfreq\t$p\t$orfs\t$genes\n";
+}