First attempt at script-run-workflow

2024-07-21 23:50:24 -04:00
parent 628356652d
commit 06dd700680
290 changed files with 5524411 additions and 0 deletions
--- a/workflow/templates/qhtcp/REMc/GTF/Process/.DS_Store
+++ b/workflow/templates/qhtcp/REMc/GTF/Process/.DS_Store
--- a/workflow/templates/qhtcp/REMc/GTF/Process/.~lock.PresltDebug.txt#
+++ b/workflow/templates/qhtcp/REMc/GTF/Process/.~lock.PresltDebug.txt#
@@ -0,0 +1 @@
+,jwrodger,hartmanlab.genetics.uab.edu,28.07.2022 09:10,file:///home/jwrodger/.config/libreoffice/4;
--- a/workflow/templates/qhtcp/REMc/GTF/Process/Concatenate_GTF_results.py
+++ b/workflow/templates/qhtcp/REMc/GTF/Process/Concatenate_GTF_results.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+
+# This code is to concatenate the batch GO Term Finder results (.tsv) generated from batch GTF perl code(Chris Johnson, U of Tulsa) into a list table
+
+import sys, os, string, glob
+
+
+# return the file list
+def list_all_files(Path):
+    list_all_files = []
+    list_all_files = glob.glob(Path +'/*.txt.tsv')
+    return list_all_files
+
+
+# Main function
+
+try:
+   data_file_Path = sys.argv[1]
+   output_file_Path = sys.argv[2]
+except: 
+   print ('Usage: python Concatenate_GTF_results.py /datasetPath /outputFilePath_and_Name')
+   print ('Data file not found, error in given directory')
+   sys.exit(1)
+
+# Open the output file
+
+try:
+    output = open(output_file_Path, 'w')
+except OSError:
+    print ('output file error')
+
+# get all the GTF result files in given directory
+File_list = []
+File_list = list_all_files(data_file_Path)
+File_list.sort()
+
+i = 0
+for file in File_list:
+    #parse the file names given in absolute path
+    file_name = file.strip().split('/')[-1]
+    file_name = file_name.rstrip('.txt.tsv')
+    # function to read tsv files from a given directory
+    #open the file
+    data = open(file,'r')
+    #reading the label line
+    labelLine = data.readline()
+    label = labelLine.strip().split('\t')
+    #write the label
+    #updates2010July26: update following label writing code	
+    if i == 0:
+    #    output.write('cluster origin')
+        for element in label:
+            output.write(element)
+            output.write('\t')
+    i = i + 1
+    #updates2010July26 End
+    #switch to the next line
+    output.write('\n')
+
+    #read the GO terms
+    GOTermLines = data.readlines()
+    for GOTerm in GOTermLines:
+        GOTerm = GOTerm.strip().strip('\t')
+        if GOTerm != '':
+	    #updates2010July26: remove the code to write the first column 'REMc cluster ID'
+            #output.write(file_name)
+            #output.write('\t')
+	    ##updates2010July26 update end
+            output.write(GOTerm + '\n')
+            #output.write('\n')
+output.close()
--- a/workflow/templates/qhtcp/REMc/GTF/Process/ORF_List_DAmPs_Only.txt
+++ b/workflow/templates/qhtcp/REMc/GTF/Process/ORF_List_DAmPs_Only.txt
--- a/workflow/templates/qhtcp/REMc/GTF/Process/ORF_List_Without_DAmPs.txt
+++ b/workflow/templates/qhtcp/REMc/GTF/Process/ORF_List_Without_DAmPs.txt
--- a/workflow/templates/qhtcp/REMc/GTF/Process/ORFs_w_DAmP_list.txt
+++ b/workflow/templates/qhtcp/REMc/GTF/Process/ORFs_w_DAmP_list.txt
--- a/workflow/templates/qhtcp/REMc/GTF/Process/SGD_features.tab
+++ b/workflow/templates/qhtcp/REMc/GTF/Process/SGD_features.tab
--- a/workflow/templates/qhtcp/REMc/GTF/Process/SGD_features.tab.txt
+++ b/workflow/templates/qhtcp/REMc/GTF/Process/SGD_features.tab.txt
--- a/workflow/templates/qhtcp/REMc/GTF/Process/analyze_v2.pl
+++ b/workflow/templates/qhtcp/REMc/GTF/Process/analyze_v2.pl
@@ -0,0 +1,390 @@
+#!/usr/bin/perl
+
+# $Id: analyze.pl,v 1.9 2008/05/14 20:45:37 sherlock Exp $
+
+# Date   : 16th October 2003
+# Author : Gavin Sherlock
+
+# License information (the MIT license)
+
+# Copyright (c) 2003 Gavin Sherlock; Stanford University
+
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+use strict;
+use warnings;
+use diagnostics;
+
+use Data::Dumper;
+
+use Getopt::Long;
+
+use IO::File;
+
+use GO::TermFinder;
+use GO::AnnotationProvider::AnnotationParser;
+use GO::OntologyProvider::OboParser;
+
+use GO::TermFinderReport::Text;
+
+use GO::Utils::File    qw (GenesFromFile);
+use GO::Utils::General qw (CategorizeGenes);
+
+$|=1;
+
+###################################################################################
+sub Usage{
+###################################################################################
+
+    my $message = shift;
+
+    if (defined $message){
+
+	print $message, "\n";
+
+    }
+
+    print <<USAGE;
+
+This program takes a list of files, each of which contain a list of
+genes, with one gene per line.  It will findTerms for the lists of
+genes in each of the GO aspects, outputting the results to a file
+named for the original file, but with a .terms extension.  It will only
+output terms with a corrected P-value of <= 0.05.
+
+It will use the first supplied argument as the annotation file, the
+second argument as the expected number of genes within the organism,
+the third argument is the name of the obo file, and all subsequent
+files as ones containing lists of genes.
+
+Usage:
+
+analyze.pl <annotation_file> <numGenes> <obofile> <file1> <file2> <file3> ... <fileN>
+
+e.g.
+
+analyze.pl -a ../t/gene_association.sgd -n 7200 -o ../t/gene_ontology_edit.obo genes.txt genes2.txt
+
+USAGE
+
+    exit;
+
+}
+
+# we need at least 3 arguments, an annotation file, the number of
+# genes in the genome, and a file of input genes to test
+
+&Usage if (@ARGV < 3);
+
+# now get our annotation file and number of genes
+
+my $annotationFile = '';
+my $totalNum       = '';
+my $oboFile        = '';
+my $background	   = '';
+my $aspect	   = '';
+
+GetOptions(	"annotations=s"	=> \$annotationFile,
+		"obofile=s"	=> \$oboFile,
+		"background=s"	=> \$background,
+		"numGenes=i"	=> \$totalNum,
+		"aspect=s"	=> \$aspect
+	);
+
+if ($oboFile !~ /\.obo$/){
+
+    # require the obo file to have a .obo extension
+
+    &Usage("Your obo file does not have a .obo extension.");
+
+}
+
+if ($annotationFile !~ /\.sgd$/){
+	&Usage("Perhaps we are missing an annotation file.");
+}
+
+my @population = ();
+if ($background) {
+	@population = GenesFromFile($background)
+}
+
+# now set up the objects we need
+
+my $process   = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
+						     aspect       => 'P');
+my $component = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
+						     aspect       => 'C');
+my $function  = GO::OntologyProvider::OboParser->new(ontologyFile => $oboFile,
+						     aspect       => 'F');
+
+my $annotation = GO::AnnotationProvider::AnnotationParser->new(annotationFile=>$annotationFile);
+
+my @termFinders = ();
+
+if ($background) {
+	if ($aspect =~ /^P$|^$/) {
+		push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
+		   				   ontologyProvider  => $process,
+						   population	     => \@population,
+						   aspect            => 'P');
+	}
+	
+	if ($aspect =~ /^C$|^$/) {
+		push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
+						   ontologyProvider  => $component,
+						   population	     => \@population,
+						   aspect            => 'C');
+	}
+	
+	if ($aspect =~ /^F$|^$/) {
+		push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
+						   ontologyProvider  => $function,
+						   population	     => \@population,
+						   aspect            => 'F');
+	}
+} else {
+	if ($aspect =~ /^P$|^$/) {
+		push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
+		   				   ontologyProvider  => $process,
+						   totalNumGenes     => $totalNum,
+						   aspect            => 'P');
+	}
+	
+	if ($aspect =~ /^C$|^$/) {
+		push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
+						   ontologyProvider  => $component,
+						   totalNumGenes     => $totalNum,
+						   aspect            => 'C');
+	}
+	
+	if ($aspect =~ /^F$|^$/) {
+		push @termFinders, GO::TermFinder->new(annotationProvider=> $annotation,
+						   ontologyProvider  => $function,
+						   totalNumGenes     => $totalNum,
+						   aspect            => 'F');
+	}
+}
+
+
+my $report = GO::TermFinderReport::Text->new();
+
+my $cutoff = 0.1;
+
+# now go through each file
+
+foreach my $file (@ARGV){
+
+    print "Analyzing $file\n";
+
+    my @genes = GenesFromFile($file); 
+
+    my (@list, @notFound, @ambiguous);
+
+    CategorizeGenes(annotation  => $annotation,
+		    genes       => \@genes,
+		    ambiguous   => \@ambiguous,
+		    unambiguous => \@list,
+		    notFound    => \@notFound);
+
+    my $outfile = $file.".terms";
+
+    my $fh = IO::File->new($outfile, q{>} )|| die "Cannot make $outfile : $!";
+
+    print "Results being put in $outfile\n";
+
+    if (@list){
+
+	print $fh "The following gene(s) will be considered:\n\n";
+
+	foreach my $gene (@list){
+
+	    print $fh $gene, "\t", $annotation->standardNameByName($gene), "\n";
+
+	}
+
+	print $fh "\n";
+
+    }else{
+
+	print $fh "None of the gene names were recognized\n";
+	print $fh "They were:\n\n";
+	print $fh join("\n", @notFound), "\n";
+	$fh->close;
+
+	next;
+
+    }
+
+    if (@ambiguous){
+
+	# note, some of these ambiguous names would be perfectly fine
+	# if put into GO::TermFinder if they are also standard names.
+	# Currently the behavior of analyze.pl differs from the
+	# default behavior of GO::TermFinder
+
+	print $fh "The following gene(s) are ambiguously named, and so will not be used:\n";
+	print $fh join("\n", @ambiguous), "\n\n";
+
+    }
+
+    if (@notFound){
+
+	print $fh "The following gene(s) were not recognized, and will not be considered:\n\n";
+	print $fh join("\n", @notFound), "\n\n";
+
+    }
+
+    foreach my $termFinder (@termFinders){
+
+	# it's possible that the supplied number of genes on the
+	# command line was less than indicated by the annotation
+	# provider, and thus the TermFinder may have used a larger
+	# number than was entered on the command line.
+
+	my $totalNumGenesUsedInBackground = $termFinder->totalNumGenes;
+
+	print $fh "Finding terms for ", $termFinder->aspect, "\n\n";
+
+	my @pvalues = $termFinder->findTerms(genes => \@list, calculateFDR => 1);
+  if($#pvalues == 0) {
+    print "WARNIING: NO p-value structures returned by findTerms(";
+    print join ",", @list;
+    print ")\n";
+  	print $fh "\n\n";
+    $fh->close;
+    exit();
+  }
+	my $numHypotheses = $report->print(pvalues  => \@pvalues,
+					   numGenes => scalar(@list),
+					   totalNum => $totalNumGenesUsedInBackground,
+					   cutoff   => $cutoff,
+					   fh       => $fh);
+
+  my $numProcesses = $#pvalues + 1;
+  print "Number of GO processes found: $numProcesses\n";
+  print "Number of hypotheses passed cutoff: $numHypotheses\n";
+
+	# if they had no significant P-values
+
+	if ($numHypotheses == 0){
+
+	    print $fh "No terms were found for this aspect with a corrected P-value <= $cutoff.\n";
+
+	}
+
+	print $fh "\n\n";
+
+    }
+
+    $fh->close;    
+    
+}
+
+=pod
+
+=head1 NAME
+
+analyze.pl - batch processor to find terms for lists of genes in various files
+
+=head1 SYNOPSIS
+
+This program takes a list of files, each of which contain a list of
+genes, with one gene per line.  It will findTerms for the lists of
+genes in each of the GO aspects, outputting the results to a file
+named for the original file, but with a .terms extension.  It will
+only output terms with a corrected P-value of <= 0.05.
+
+It will use the first supplied argument as the annotation file, the
+second argument as the expected number of genes within the organism,
+the third argument is the name of the obo file, and all subsequent
+files as ones containing lists of genes.
+
+Usage:
+
+    analyze.pl <annotation_file> <numGenes> <obofile> <file1> <file2> <file3> ... <fileN>
+
+e.g.
+
+    analyze.pl ../t/gene_association.sgd 7200 ../t/gene_ontology_edit.obo genes.txt genes2.txt
+
+An example output file might look like this:
+
+    The following gene(s) will be considered:
+    
+    YDL235C YPD1
+    YDL224C WHI4
+    YDL225W SHS1
+    YDL226C GCS1
+    YDL227C HO
+    YDL228C YDL228C
+    YDL229W SSB1
+    YDL230W PTP1
+    YDL231C BRE4
+    YDL232W OST4
+    YDL233W YDL233W
+    YDL234C GYP7
+    
+    Finding terms for P
+    
+    
+    Finding terms for C
+    
+    
+    Finding terms for F
+    
+    -- 1 of 15--
+    GOID    GO:0005096
+    TERM    GTPase activator activity
+    CORRECTED P-VALUE       0.0113038452336839
+    UNCORRECTED P-VALUE     0.00113038452336839
+    NUM_ANNOTATIONS 2 of 12 in the list, vs 31 of 7272 in the genome
+    The genes annotated to this node are:
+    YDL234C, YDL226C
+    -- 2 of 15--
+    GOID    GO:0008047
+    TERM    enzyme activator activity
+    CORRECTED P-VALUE       0.0316194107645226
+    UNCORRECTED P-VALUE     0.00316194107645226
+    NUM_ANNOTATIONS 2 of 12 in the list, vs 52 of 7272 in the genome
+    The genes annotated to this node are:
+    YDL234C, YDL226C
+    -- 3 of 15--
+    GOID    GO:0005083
+    TERM    small GTPase regulatory/interacting protein activity
+    CORRECTED P-VALUE       0.0340606972468798
+    UNCORRECTED P-VALUE     0.00340606972468798
+    NUM_ANNOTATIONS 2 of 12 in the list, vs 54 of 7272 in the genome
+    The genes annotated to this node are:
+    YDL234C, YDL226C
+    -- 4 of 15--
+    GOID    GO:0030695
+    TERM    GTPase regulator activity
+    CORRECTED P-VALUE       0.0475469908576535
+    UNCORRECTED P-VALUE     0.00475469908576535
+    NUM_ANNOTATIONS 2 of 12 in the list, vs 64 of 7272 in the genome
+    The genes annotated to this node are:
+    YDL234C, YDL226C
+
+=head1 AUTHORS
+
+Gavin Sherlock, sherlock@genome.stanford.edu
+
+=cut
--- a/workflow/templates/qhtcp/REMc/GTF/Process/gene_association.sgd
+++ b/workflow/templates/qhtcp/REMc/GTF/Process/gene_association.sgd
--- a/workflow/templates/qhtcp/REMc/GTF/Process/gene_ontology_edit.obo
+++ b/workflow/templates/qhtcp/REMc/GTF/Process/gene_ontology_edit.obo
--- a/workflow/templates/qhtcp/REMc/GTF/Process/terms2tsv_v4.pl
+++ b/workflow/templates/qhtcp/REMc/GTF/Process/terms2tsv_v4.pl
@@ -0,0 +1,55 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use diagnostics;
+
+use File::Map qw(map_file);
+
+my $infile = shift;
+
+my $input;
+
+map_file $input, $infile;
+
+{
+	local $_	= $input;
+	(my $f		= $infile) =~ s/(.*\/)?(.*)(\.[^\.]*){2}/$2/;
+	my %orfgene	= (/(Y\w+)\s+(\w+)\n/g);
+	my @indices	= (/\Q-- \E(\d+) of \d+\Q --\E/g);
+	my @ids		= (/GOID\s+GO:(\d+)/g);
+	my @terms	= (/TERM\s+(.*?)\n/g);
+	my @pvalues	= (/\nCORRECTED P-VALUE\s+(\d.*?)\n/g);
+	my @clusterf	= (/NUM_ANNOTATIONS\s+(\d+ of \d+)/g);
+	my @bgfreq	= (/, vs (\d+ of \d+) in the genome/g);
+	my @orfs	= (/The genes annotated to this node are:\n(.*?)\n/g);
+	
+	s/, /:/g for @orfs;
+
+	my @genes;
+	for my $orf (@orfs) {
+		my @otmp = split /:/, $orf;
+		my @gtmp = map { $orfgene{$_} } @otmp;
+		push @genes, (join ':', @gtmp);
+	}
+
+	&header();
+	for my $i (0 .. (@ids - 1)) {
+		&report($f, $ids[$i], $terms[$i], $pvalues[$i], $clusterf[$i], $bgfreq[$i], $orfs[$i], $genes[$i]);
+	}
+}
+
+sub header {
+	print "REMc ID\tGO_term ID\tGO-term\tCluster frequency\tBackground frequency\tP-value\tORFs\tGenes\n";
+}
+
+sub report {
+	my ($f, $id, $term, $p, $cfreq, $bgfreq, $orfs, $genes) = @_;
+
+	$cfreq =~ /(\d+) of (\d+)/;
+	$cfreq = sprintf "%d out of %d genes, %.1f%%", $1, $2, (100*$1/$2);
+
+	$bgfreq =~ /(\d+) of (\d+)/;
+	$bgfreq = sprintf "%d out of %d genes, %.1f%%", $1, $2, (100*$1/$2);
+
+	print "$f\t$id\t$term\t$cfreq\t$bgfreq\t$p\t$orfs\t$genes\n";
+}
				`@@ -0,0 +1 @@`
				`,jwrodger,hartmanlab.genetics.uab.edu,28.07.2022 09:10,file:///home/jwrodger/.config/libreoffice/4;`