Commit 3f910780 authored by rictjo

edit

Parent: fc5506eb
@@ -79,5 +79,6 @@ print ( res_dfs )
# Manually updated code backups for this library :
GitLab: https://gitlab.com/richardtjornhammar/impetuous
CSDN: https://codechina.csdn.net/m0_52121311/impetuous
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
import sys
import sklearn.cluster as sc
try :
from numba import jit
bUseNumba = True
except ImportError :
bUseNumba = False
# THE FOLLOWING KMEANS ALGORITHM IS THE AUTHOR'S OWN LOCAL VERSION
if bUseNumba :
@jit(nopython=True)
def seeded_kmeans( dat, cent ):
#
# PYTHON ADAPTATION OF MY C++ CODE THAT CAN BE FOUND IN
# https://github.com/richardtjornhammar/RichTools/blob/master/src/cluster.cc
# AROUND LINE 2345
        # AGAIN, CONSIDER USING THE C++ VERSION SINCE IT IS A LOT FASTER
# HERE WE SPEED IT UP USING NUMBA IF THE USER HAS IT INSTALLED AS A MODULE
#
NN , MM = np.shape ( dat )
KK , LL = np.shape ( cent )
if not LL == MM :
            print ( 'WARNING: DATA FORMAT ERROR. NON-COALESCING COORDINATE AXES' )
labels = [ int(z) for z in np.zeros(NN) ]
w = labels
counts = np.zeros(KK)
tmp_ce = np.zeros(KK*MM).reshape(KK,MM)
old_error , error , TOL = 0. , 1. , 1.0E-10
while abs ( error - old_error ) > TOL :
old_error = error
error = 0.
counts = counts * 0.
tmp_ce = tmp_ce * 0.
# START BC
for h in range ( NN ) :
min_distance = 1.0E30
for i in range ( KK ) :
distance = np.sum( ( dat[h]-cent[i] )**2 )
if distance < min_distance :
labels[h] = i
min_distance = distance
tmp_ce[labels[h]] += dat[ h ]
counts[labels[h]] += 1.0
error += min_distance
# END BC
for i in range ( KK ) :
if counts[i]>0:
cent[i] = tmp_ce[i]/counts[i]
centroids = cent
return ( labels, centroids )
else :
def seeded_kmeans( dat, cent ):
#
        # SLOW, SLUGGISH KMEANS WITH A DOUBLE FOR LOOP
# IN PYTHON! WOW! SUCH SPEED!
#
NN , MM = np.shape ( dat )
KK , LL = np.shape ( cent )
if not LL == MM :
            print ( 'WARNING: DATA FORMAT ERROR. NON-COALESCING COORDINATE AXES' )
labels = [ int(z) for z in np.zeros(NN) ]
w = labels
counts = np.zeros(KK)
tmp_ce = np.zeros(KK*MM).reshape(KK,MM)
old_error , error , TOL = 0. , 1. , 1.0E-10
while abs ( error - old_error ) > TOL :
old_error = error
error = 0.
counts = counts * 0.
tmp_ce = tmp_ce * 0.
# START BC
for h in range ( NN ) :
min_distance = 1.0E30
for i in range ( KK ) :
distance = np.sum( ( dat[h]-cent[i] )**2 )
if distance < min_distance :
labels[h] = i
min_distance = distance
tmp_ce[labels[h]] += dat[ h ]
counts[labels[h]] += 1.0
error += min_distance
# END BC
for i in range ( KK ) :
if counts[i]>0:
cent[i] = tmp_ce[i]/counts[i]
centroids = cent
return ( labels, centroids )
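#
# A minimal, hedged usage sketch (synthetic data and seed centroids, not part of the
# original tests): the routine is handed initial centroid guesses and returns the refined
# labels together with the updated centroids.
def _example_seeded_kmeans_usage () :
    dat  = np.concatenate ( [ np.random.rand(10,2) , np.random.rand(10,2) + 5. ] )
    cent = np.array ( [ [ 0.5 , 0.5 ] , [ 5.5 , 5.5 ] ] )
    labels , centroids = seeded_kmeans ( dat , cent )
    return ( labels , centroids )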
from scipy.spatial.distance import squareform , pdist
absolute_coordinates_to_distance_matrix = lambda Q:squareform(pdist(Q))
distance_matrix_to_geometry_conversion_notes = """
*) TAKE NOTE THAT THE OLD ALGORITHM KNOWN AS DISTANCE GEOMETRY EXISTS. IT CAN BE EMPLOYED ON DATA OF ANY DIMENSION. HERE YOU FIND AN SVD BASED ANALOGUE OF THAT OLD METHOD.
*) PDIST RETURNS PLAIN (SQUARE ROOTED) DISTANCES, SO WE SQUARE THE RESULT IF IT IS NOT ALREADY SQUARED.
*) IN SHORT: THE CONVERSION ROUTINE BACK TO ABSOLUTE COORDINATES EXPECTS SQUARED (R2) DISTANCES IN THE DISTANCE MATRIX.
"""
if bUseNumba :
@jit(nopython=True)
def distance_matrix_to_absolute_coordinates ( D , bSquared = False, n_dimensions=2 ):
if not bSquared :
D = D**2.
DIM = n_dimensions
DIJ = D*0.
M = len(D)
for i in range(M) :
for j in range(M) :
DIJ[i,j] = 0.5* (D[i,-1]+D[j,-1]-D[i,j])
D = DIJ
U,S,Vt = np.linalg.svd ( D , full_matrices = True )
S[DIM:] *= 0.
Z = np.diag(S**0.5)[:,:DIM]
xr = np.dot( Z.T,Vt )
return ( xr )
else :
def distance_matrix_to_absolute_coordinates ( D , bSquared = False, n_dimensions=2 ):
if not bSquared :
D = D**2.
DIM = n_dimensions
DIJ = D*0.
M = len(D)
for i in range(M) :
for j in range(M) :
DIJ[i,j] = 0.5* (D[i,-1]+D[j,-1]-D[i,j])
D = DIJ
U,S,Vt = np.linalg.svd ( D , full_matrices = True )
S[DIM:] *= 0.
Z = np.diag(S**0.5)[:,:DIM]
xr = np.dot( Z.T,Vt )
return ( xr )
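#
# A hedged round-trip sketch for the two helpers above (illustrative only): build the
# distance matrix of some random 3D points and recover coordinates that reproduce the same
# pairwise distances. The recovered points are only defined up to a rigid transformation,
# so the comparison is made on the distance matrices rather than on the coordinates.
def _example_distance_geometry_roundtrip ( n_points = 6 , n_dimensions = 3 ) :
    P    = np.random.rand ( n_points , n_dimensions )
    dm   = absolute_coordinates_to_distance_matrix ( P )
    Xr   = distance_matrix_to_absolute_coordinates ( dm , bSquared = False , n_dimensions = n_dimensions ).T
    dm_r = absolute_coordinates_to_distance_matrix ( Xr )
    return ( np.max ( np.abs ( dm - dm_r ) ) ) # SHOULD BE CLOSE TO ZERO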
def connectivity ( B , val, bVerbose=True ) :
description="""
    This is a cutoff-based clustering algorithm. The intended use is to supply a symmetric distance matrix (non-negative entries) together with a distance cutoff. For a small cutoff you should recover the individual parts of the system, and for a large cutoff you should recover the entire system as a single cluster. It has been employed for statistical analysis work as well as for its original application, segmenting molecular systems.
"""
# PYTHON ADAPTATION OF MY C++ CODE THAT CAN BE FOUND IN
# https://github.com/richardtjornhammar/RichTools/blob/master/src/cluster.cc
# AROUND LINE 2277
# CONSIDER COMPILING AND USING THAT AS A MODULE INSTEAD OF THIS SINCE IT IS
# A LOT FASTER
# FOR A DESCRIPTION READ PAGE 30 (16 INTERNAL NUMBERING) of:
# https://kth.diva-portal.org/smash/get/diva2:748464/FULLTEXT01.pdf
#
nr_sq,mr_sq = np.shape(B)
if nr_sq != mr_sq :
print ( 'ERROR' )
exit (1)
N = mr_sq
res , nvisi, s, NN, ndx, C = [], [], [], [], [], 0
res .append(0)
for i in range(N) :
nvisi.append(i+1)
res.append(0); res.append(0)
ndx.append(i)
while ( len(ndx)>0 ) :
i = ndx[-1] ; ndx = ndx[:-1]
NN = []
if ( nvisi[i]>0 ) :
C-=1
for j in range(N) :
if ( B[i,j]<val ) :
NN.append(j)
while ( len(NN)>0 ) :
# back pop_back
k = NN[-1]; NN = NN[:-1]
nvisi[k] = C
for j in range(N):
if ( B[j,k]<val ) :
for q in range(N) :
if ( nvisi[q] == j+1 ) :
NN.append(q)
if bVerbose : # VERBOSE
print("INFO "+str(-1*C) +" clusters" )
Nc = [ 0 for i in range(-1*C) ]
for q in range(N) :
res[ q*2+1 ] = q;
res[ q*2 ] = nvisi[q]-C;
Nc [res[q*2]]+= 1;
if bVerbose :
print ( " "+str(res[q*2])+" "+str(res[2*q+1]) )
for i in range(-1*C) :
print( "CLUSTER " +str(i)+ " HAS " + str(Nc[i]) + " ELEMENTS")
return ( Nc , np.array(res[:-1]).reshape(-1,2) )
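#
# A brief hedged example of the cutoff clustering above (it mirrors the test in __main__
# at the bottom of this file): two tight pairs of points and a cutoff that separates them.
def _example_connectivity_usage () :
    crds = np.array ( [ [0.,0.] , [0.1,0.] , [5.,5.] , [5.1,5.] ] )
    D    = absolute_coordinates_to_distance_matrix ( crds )
    Nc , assignments = connectivity ( D , 0.5 , bVerbose = False )
    return ( Nc , assignments ) # EXPECT TWO CLUSTERS WITH TWO ELEMENTS EACH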
clustering_algorithm = None
clustering_algorithm = sc.KMeans(10) # CHOOSE SOMETHING YOU LIKE NOT THIS
class Cluster(object):
def __init__( self, nbins=50, nclusters=-1 , use_ranks = False ) :
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from numpy import histogram2d
from scipy.stats import rankdata
self.use_ranks = use_ranks
self.nclusters = nclusters
self.nbins = nbins
self.histogram2d = histogram2d
self.KMeans = KMeans
self.rankdata = rankdata
self.pca_f = PCA(2)
self.centroids_ = None
self.labels_ = None
self.df_ = None
self.num_index_ = None
self.components_ = None
def approximate_density_clustering( self, df, nbins=None ) :
#
# GENES APPROX 20K OK SO APPROX 50 BINS
# ANALYTES ON ROWS, SAMPLE POINTS ON COLUMNS
if nbins is None :
nbins = self.nbins
self.df_= df
frac_df = df
if self.use_ranks :
            frac_df = frac_df.apply( lambda x:self.rankdata( x , method='average' )/float(len(x)) )
self.pca_f.fit(frac_df.T.values)
self.components_ = self.pca_f.components_
vals,xe,ye = self.histogram2d(self.pca_f.components_[0],self.pca_f.components_[1],bins=nbins)
mvs, svsx, svsy = np.mean(vals),np.std(vals,0),np.std(vals,1)
svs = np.sqrt(svsx**2+svsy**2)
#
# IS THERE A DENSITY PEAK SEPARABLE FROM THE MEAN
# SHOULD DO GRADIENT REJECTION BASED ON TTEST PVALUES
hits = vals>mvs+0.5*svs
#
        xe_,ye_ = 0.5*(xe[:-1]+xe[1:]) , 0.5*(ye[:-1]+ye[1:])
idx = np.where(hits); xi,yj = idx[0],idx[1]
centroids = [ (xe[ri],ye[rj]) for (ri,rj) in zip(xi,yj) ]
if self.nclusters == -1 :
self.nclusters = len ( centroids )
if self.nclusters < len ( centroids ) :
import heapq
from scipy.spatial import distance as distance_
a = distance_.cdist ( centroids, centroids, 'euclidean' )
cent_idx = heapq.nlargest ( self.nclusters, range(len(a)), a.reshape(-1).__getitem__ )
centroids = [ centroids[ idx ] for idx in cent_idx ]
kmeans = self.KMeans(len(centroids),init=np.array(centroids))
kmeans.fit(self.pca_f.components_.T)
centers = np.array(kmeans.cluster_centers_).T
self.labels_ = kmeans.labels_
self.centroids_ = centers
self.analyte_dict_ = { c:[] for c in self.labels_ }
[self.analyte_dict_[self.labels_[i]].append(df.index[i]) for i in range(len(self.labels_)) ]
return ( self.analyte_dict_ )
def write_gmt(self, filename = './cluster_file.gmt' ) :
with open(filename,'w') as of :
for k,v in self.analyte_dict_.items() :
print ( 'CLU-'+str(k),'\tDESCRIPTION\t'+'\t'.join(v), file=of )
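#
# A hedged usage sketch of the Cluster class (purely synthetic frame with analytes on the
# rows and sample points on the columns, as approximate_density_clustering assumes; the
# output file name is illustrative only).
def _example_density_clustering () :
    df = pd.DataFrame ( np.random.rand ( 200 , 10 ) ,
                        index = [ 'analyte'+str(i) for i in range(200) ] )
    clu = Cluster ( nbins = 10 )
    analyte_clusters = clu.approximate_density_clustering ( df )
    clu.write_gmt ( './example_cluster_file.gmt' )
    return ( analyte_clusters )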
class ManifoldClustering ( Cluster ) :
def __init__( self , nbins=50 ) :
from sklearn.cluster import KMeans
from sklearn.manifold import MDS, TSNE
from numpy import histogram2d
from scipy.stats import rankdata
self.nbins = nbins
self.histogram2d = histogram2d
self.KMeans = KMeans
self.rankdata = rankdata
self.mds = MDS ( n_components=2 )
self.tsne = TSNE ( n_components=2 )
self.man = None
self.centroids_ = None
self.labels_ = None
self.df_ = None
self.num_index_ = None
self.components_ = None
def approximate_embedding( self, df, nbins=None , use_tsne=True ) :
self.man = self.tsne
if not use_tsne :
self.man = self.mds
print ( 'WARNING::SLOW AND WASTEFUL' )
if nbins is None :
nbins = self.nbins
self.df_= df
frac_df = df.apply( lambda x:self.rankdata( x , method='average' )/float(len(x)) )
self.components_ = np.array(self.man.fit_transform(frac_df.values)).T
vals,xe,ye = self.histogram2d(self.components_[0],self.components_[1],bins=nbins)
mvs, svsx, svsy = np.mean(vals),np.std(vals,0),np.std(vals,1)
svs = np.sqrt( svsx**2 + svsy**2 )
#
# IS THERE A DENSITY PEAK SEPARABLE FROM THE MEAN
# SHOULD DO GRADIENT REJECTION BASED ON TTEST PVALUES
hits = vals>mvs+0.5*svs
#print(hits,vals)
        xe_,ye_ = 0.5*(xe[:-1]+xe[1:]) , 0.5*(ye[:-1]+ye[1:])
idx = np.where(hits); xi,yj = idx[0],idx[1]
centroids = [ (xe[ri],ye[rj]) for (ri,rj) in zip(xi,yj) ]
#
kmeans = self.KMeans(len(centroids),init=np.array(centroids))
kmeans.fit(self.components_.T)
centers = np.array(kmeans.cluster_centers_).T
self.labels_ = kmeans.labels_
self.centroids_ = centers
self.analyte_dict_ = { c:[] for c in self.labels_ }
[self.analyte_dict_[self.labels_[i]].append(df.index[i]) for i in range(len(self.labels_)) ]
return ( self.analyte_dict_ )
def run_clustering_and_write_gmt( df , ca , filename = './approx_cluster_file.gmt' ) :
labels = ca.fit_predict(df.values)
llabs = [ l for l in labels ]; ulabs=set(llabs)
with open(filename,'w') as of :
for ulab in ulabs :
analytes = df.iloc[llabs==ulab].index.values
print ( 'CLU-'+str(ulab),'\tDESCRIPTION\t'+'\t'.join(analytes), file=of )
def projection_knn_assignment ( projected_coords , df , NMaxGuess=-1 , n_dimensions=2 ) :
coords_s = projected_coords.dropna( 0 )
centroid_coordinates = []
for row in df.T :
guess = sorted ( [ (v,i) for (v,i) in zip( df.loc[row].values,df.loc[row].index ) ] ) [::-1][:NMaxGuess]
maxWeights = [ i[1] for i in guess ]
use = df.loc[row,maxWeights]
S = np.sum ( use.values )
S = 1. if S==0 else S
crd = np.dot(use.values,coords_s.loc[use.index.values].values)/S
centroid_coordinates.append(crd)
centroids_df = pd.DataFrame ( centroid_coordinates , index=df.index , columns=[ 'C'+str(i) for i in range(n_dimensions) ] )
labels , centroids = seeded_kmeans( coords_s.values,centroids_df.values )
coords_s.loc[:,'owner'] = centroids_df.iloc[labels].index.values
for i in range(len(centroids.T)) :
centroids_df.loc[:,'E'+str(i) ] = (centroids.T)[i]
return ( centroids_df , coords_s )
def make_clustering_visualisation_df ( CLUSTER , df=None , add_synonyms = False ,
output_name = 'feature_clusters_output.csv'
) :
x_pc1 = CLUSTER.components_[0]
y_pc2 = CLUSTER.components_[1]
L_C = len(CLUSTER.centroids_[0])
#
# MAKE CLUSTER COLORS
make_hex_colors = lambda c : '#%02x%02x%02x' % (c[0]%256,c[1]%256,c[2]%256)
C0 = [255,255,255] ; cluster_colors = []
#
for i in CLUSTER.labels_ :
        C0_ = list(C0) ; C0_[i%3] = int(np.floor(C0[i%3]-(i/float(L_C))*255))
cluster_colors.append(make_hex_colors(C0_))
if not df is None :
if add_synonyms :
            # NOTE: THE ens2sym AND ens2sym_2 MAPPINGS ARE ASSUMED TO EXIST IN THE CALLING SCOPE
            synonyms = [ ens2sym[df.index.values[i]][0] if df.index.values[i] in ens2sym \
                         else ens2sym_2[df.index.values[i]] if df.index.values[i] in ens2sym_2 \
                         else df.index.values[i] for i in range(len(df.index.values)) ]
else :
synonyms = df.index.values
data = []
for (x,y,t,cl,co) in zip( x_pc1,y_pc2,synonyms , [cl for cl in CLUSTER.labels_] ,
[cluster_colors[cl] for cl in CLUSTER.labels_] ) :
data.append([x,y,t,cl,co])
clustering_df = pd.DataFrame( data , columns = ['X','Y','Type','Cluster','Color'])
if not df is None :
clustering_df.index = df.index.values
clustering_df.to_csv( output_name , '\t' )
return ( clustering_df )
def backprojection_clustering ( analyte_df , bRanked=False , n_dimensions=2 ,
bDoFeatures=True , bDoSamples=True ) :
from scipy.stats import rankdata
if bRanked :
rana_df = analyte_df .apply( lambda x:(rankdata(x,'average')-0.5)/len(x) )
else :
rana_df = analyte_df
dimcrdnames = [ 'd'+str(i) for i in range(n_dimensions) ]
#
    # Cluster by backprojecting the distance matrix onto absolute coordinates
cluster_coords_f = None
if bDoFeatures :
#
dM1 = absolute_coordinates_to_distance_matrix( rana_df.values )
pd.DataFrame(dM1,index=rana_df.index,columns=rana_df.index).to_csv('../data/dM1.tsv','\t')
#
# Project it back onto first two components
max_var_projection = distance_matrix_to_absolute_coordinates ( dM1 , n_dimensions=n_dimensions )
cluster_coords_f = pd.DataFrame( max_var_projection ,
columns = rana_df.index ,
index = dimcrdnames ).T
cluster_coords_s = None
if bDoSamples :
#
# And again for all the samples
dM2 = absolute_coordinates_to_distance_matrix( rana_df.T.values )
pd.DataFrame(dM2,index=rana_df.columns,columns=rana_df.columns).to_csv('../data/dM2.tsv','\t')
#
# This algorithm is exact but scales somewhere between n^2 and n log n
max_var_projection = distance_matrix_to_absolute_coordinates ( dM2 , n_dimensions=n_dimensions )
cluster_coords_s = pd.DataFrame( max_var_projection ,
columns = rana_df.columns ,
index = dimcrdnames ).T
cluster_coords_s.to_csv('../data/conclust_s.tsv','\t')
return ( cluster_coords_f,cluster_coords_s )
if __name__ == '__main__' :
if False :
#
# TEST DEPENDS ON THE DIABETES DATA FROM BROAD INSTITUTE
filename = './Diabetes_collapsed_symbols.gct'
df_ = pd.read_csv(filename,'\t',index_col=0,header=2)
ddf = df_.loc[:,[ col for col in df_.columns if '_' in col ]]
ddf .index = [idx.split('/')[0] for idx in ddf.index]
run_clustering_and_write_gmt( ddf , clustering_algorithm )
#
CLU = Cluster( )
CLU .approximate_density_clustering(ddf)
CLU .write_gmt()
if False :
A = np.array( [ [0.00, 0.10, 0.10, 9.00, 9.00, 9.00],
[0.10, 0.00, 0.15, 9.00, 9.00, 9.00],
[0.10, 0.15, 0.00, 9.00, 9.00, 9.00],
[9.00, 9.00, 9.00, 0.00, 0.10, 0.10],
[9.10, 9.00, 9.00, 0.10, 0.00, 0.15],
[9.10, 9.00, 9.00, 0.10, 0.15, 0.00] ] )
print( connectivity(A,0.01) )
"""
Copyright 2019 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
import networkx as nx
from networkx.readwrite import json_graph
import json
import sys
def drop_duplicate_indices( df ):
df_ = df.loc[~df.index.duplicated(keep='first')]
return df_
def write_tree( tree , outfile='tree.json' ):
root = [ eid for eid,ancestor in tree.in_degree() if ancestor == 0 ][ 0 ]
o_json = json_graph.tree_data( tree , root )
if not outfile is None:
with open(outfile, 'w') as o_file:
json.dump(o_json, o_file )
return( o_json )
def add_attributes_to_tree ( p_df , tree ):
id_lookup = { rid:lid for (lid,rid) in nx.get_node_attributes(tree,'source').items() }
add_attributes = p_df.columns.values
for attribute in add_attributes:
propd = { id_lookup[idx]:{attribute:val} for (idx,val)
in zip(p_df.index.values,p_df.loc[:,attribute]) if idx in set(id_lookup.keys()) }
nx.set_node_attributes( tree , propd )
return( tree )
def parent_child_to_dag (
relationship_file = './PCLIST.txt' ,
i_p = 0 , i_c = 1
) :
n_df = pd .read_csv ( relationship_file , '\t' )
pair_tuples = [ (p,c) for (p,c) in zip(n_df.iloc[:,i_p],n_df.iloc[:,i_c]) ]
children_of = {} ; all_names = set([])
for ( p,c ) in pair_tuples :
if not 'list' in str(type(c)):
C = [c]
all_names = all_names | set([p]) | set(C)
if p in children_of :
children_of[p] .append(c)
else :
children_of[p] = [c]
G = nx .DiGraph()
G .add_nodes_from ( all_names )
G .add_edges_from ( pair_tuples )
tree = nx.algorithms.dag.dag_to_branching( G )
root = [ eid for eid,ancestor in tree.in_degree() if ancestor == 0 ][ 0 ]
descendants = [ ( idx , nx.algorithms.dag.descendants(G,idx) ) for idx in all_names ]
ancestors = [ ( idx , nx.algorithms.dag.ancestors(G,idx) ) for idx in all_names ]
return ( tree,ancestors,descendants )
def make_pathway_ancestor_data_frame(ancestors):
p_df = None
for k,v in ancestors :
t_df = pd.DataFrame([[','.join(list(v)),len(v)]],index=[k],columns=['DAG,ancestors','DAG,level'])
if p_df is None :
p_df = t_df
else :
p_df = pd.concat([p_df,t_df])
return( p_df )
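#
# A hedged, self-contained sketch of the DAG helpers above: write a tiny parent-child list
# to disk (the file name is illustrative only), build the branching and summarise the
# ancestors of every node.
def _example_build_dag ( tmpfile = './example_pclist.txt' ) :
    pd.DataFrame ( [ ['root','A'] , ['root','B'] , ['A','C'] ] ,
                   columns = ['parent','child'] ).to_csv ( tmpfile , sep='\t' , index=False )
    tree , ancestors , descendants = parent_child_to_dag ( tmpfile )
    p_df = make_pathway_ancestor_data_frame ( ancestors )
    return ( tree , p_df )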
def normalise_for_apples_and_oranges_stats( X , method='ordinal' ):
    from scipy.stats import rankdata
    X_ = rankdata( X , method=method )/len(X)
return(X_)
def make_group_analytes_unique( grouping_file , delimiter='\t' ):
    unique_grouping_file = '/'.join(grouping_file.split('/')[:-1]) + '/unique_'+grouping_file.split('/')[-1]
    with open( unique_grouping_file , 'w' ) as of:
with open( grouping_file ) as input :
for line in input :
vline = line.replace('\n','').split(delimiter)
gid, gdesc, analytes_ = vline[0], vline[1], list(set(vline[2:]))
nvec = [gid,gdesc] ; [ nvec.append(a) for a in analytes_ ]
print ( delimiter.join(nvec) , file = of )
def read_conversions(file_name) :
gene2ens = {} ; non_unique = []
with open( file_name , 'r' ) as infile:
if sys.version_info[0] < 3:
infile.next()
else :
next(infile)
for line in infile:
words = line.strip().split('\t')
if len(words)==2 :
ens, gene = words
if gene in gene2ens:
gene2ens[gene].append(ens)
else :
gene2ens[gene] = [ens]
return gene2ens
def read_gene_ensemble_conversion(file_name):
gene2ens = {} ; non_unique = []
with open(file_name,'r') as infile:
if sys.version_info[0] < 3:
infile.next()
else:
next(infile)
for line in infile:
words = line.strip().split('\t')
if len(words)==2 :
ens, gene = words
if gene in gene2ens:
non_unique.append((gene,ens,gene2ens[gene]))
else :
gene2ens[gene] = ens
if len(non_unique)>0:
print(' WARNING ' )
print( 'FOUND ', len(non_unique), ' NON UNIQUE ENTRIES' )
return gene2ens
def create_synonyms( convert_file , unique_mapping=False ):
# CREATE SYNONYMS
ens2sym , sym2ens = {} , {}
if unique_mapping:
sym2ens = read_gene_ensemble_conversion( convert_file )
ens2sym = { v:k for k,v in sym2ens.items() }
else :
sym2ens_list = read_conversions( convert_file )
ens2sym_list = {}
for s,L in sym2ens_list.items() :
for e in L:
if e in ens2sym_list:
                    ens2sym_list[e].append(s)
else:
ens2sym_list[e] = [s]
ens2sym = ens2sym_list
sym2ens = sym2ens_list
return ( ens2sym , sym2ens )
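#
# A hedged sketch of the conversion-file format the readers above assume: one header line
# followed by tab separated "ensembl-id <tab> gene-symbol" rows. The file name and the
# example identifiers are illustrative only.
def _example_create_synonyms ( tmpfile = './example_conv.txt' ) :
    with open ( tmpfile , 'w' ) as of :
        print ( 'ensembl\tsymbol' , file = of )
        print ( 'ENSG00000139618\tBRCA2' , file = of )
        print ( 'ENSG00000012048\tBRCA1' , file = of )
    ens2sym , sym2ens = create_synonyms ( tmpfile )
    return ( ens2sym , sym2ens )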
def flatten_dict( s2e ) :
# FORCES FIRST ELEMENT
ndict = {}
for (s,e) in s2e.items() :
if 'list' in str(type(e)) :
ndict[s] = e[0]
else :
ndict[s] = e
return ( ndict )
def convert_rdata_to_dataframe ( filename ) :
#
from rpy2.robjects import r as R
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
import rpy2.robjects as ro
#
    print ( 'WARNING: THIS FUNCTION NEEDS VALUE ERROR CHECKING' )
rd_ = R.load( filename )
if 'matrix' in str( type( R[rd_[0]] ) ).lower() :
column_names = [ R[rd_[0]].colnames ]
index_names = [ R[rd_[0]].rownames ]
else :
column_names = [ [r for r in _rd_.colnames] for _rd_ in R[rd_[0]]]
index_names = [ [r for r in _rd_.rownames] for _rd_ in R[rd_[0]]]
#
pandas2ri.activate()
#
    # SMALL HELPER FUNCTION THAT TRANSFORMS AN RDATA OBJECT INTO
# A PANDAS DATAFRAME. CURRENTLY THERE IS NO VALUE ERROR CHECKING
#
rd = R.load( filename )
raw_df_l = []
if 'ndarray' in str( type( R[rd[0]] ) ).lower() :
[ raw_df_l.append( R[rd[0]] ) ]
else :
[ raw_df_l.append( rdf ) for rdf in ro.vectors.DataFrame(R[rd[0]]) ]
full_df_dict = {} ; i_ = 0
for raw_df,colnames,rownames in zip( raw_df_l,column_names,index_names ) :
pdf = pd.DataFrame( raw_df , columns=colnames , index=rownames )
full_df_dict[i_] = pdf
i_ = i_ + 1
pandas2ri.deactivate()
return ( full_df_dict )
import os
if __name__ == '__main__' :
#
bMOFA_data = True
if bMOFA_data :
import os
# os.system('mkdir ../data')
# os.system('wget https://github.com/bioFAM/MOFAdata/blob/master/data/CLL_data.RData')
# os.system('mv CLL_data.RData ../data/.')
df_dict = convert_rdata_to_dataframe( filename = '../data/CLL_data.RData' )
pruned_df = df_dict[2].T.dropna().T
journal_df = df_dict[3].loc[['IGHV'],:]
mask = [ v>=0 for v in journal_df.loc['IGHV'].values ]
journal_df = journal_df.iloc[ :,mask ]
use = list(set(pruned_df.columns)&set(journal_df.columns))
analyte_df = pruned_df .loc[ :,use ].apply( lambda x:np.log2(1+x) )
journal_df = journal_df .loc[ :,use ]
print ( analyte_df,journal_df )
from impetuous.quantification import *
qdf = quantify_groups ( analyte_df , journal_df , 'anova~C(IGHV)' , '~/Work/data/naming_and_annotations/reactome/reactome.gmt' )
print(qdf)
exit(1)
base = '../../../data/'
convert_file = base + 'naming_and_annotations/conv.txt'
ens2sym , sym2ens = create_synonyms( convert_file )
s2e = { **flatten_dict(sym2ens) }
name,col,sep = 'Networks.nfo',0,'\t'
with open(name,'r') as input:
for line in input:
gname = line.split(sep)[col].replace('\n','')
if gname in s2e:
print(gname)
else:
print('MISSING:',gname)
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import numpy as np
import pandas as pd
def read_xyz(name='data/naj.xyz',header=2,sep=' '):
mol_str = pd.read_csv(name,header=header)
P=[]
for i_ in range(len(mol_str.index)):
line = mol_str.iloc[i_,:].values[0]
lsp = [l.replace(' ','') for l in line.split(sep) if len(l)>0]
P.append(lsp)
pdf = pd.DataFrame(P); pdf.index=pdf.iloc[:,0].values ; pdf=pdf.iloc[:,1:4]
return(pdf.apply(pd.to_numeric))
def KabschAlignment( P,Q ):
#
# https://en.wikipedia.org/wiki/Kabsch_algorithm
# C++ VERSION: https://github.com/richardtjornhammar/RichTools/blob/master/src/richfit.cc
    # IN THE VICINITY OF LINE 524
#
N,DIM = np.shape( P )
M,DIM = np.shape( Q )
if DIM>N or not N==M :
print( 'MALFORMED COORDINATE PROBLEM' )
exit( 1 )
q0 , p0 = np.mean(Q,0) , np.mean(P,0)
cQ , cP = Q - q0 , P - p0
H = np.dot(cP.T,cQ)
I = np.eye( DIM )
U, S, VT = np.linalg.svd( H, full_matrices=False )
Ut = np.dot( VT.T,U.T )
I[DIM-1,DIM-1] = 2*(np.linalg.det(Ut) > 0)-1
ROT = np.dot( VT.T,np.dot(I,U.T) )
B = np.dot(ROT,P.T).T + q0 - np.dot(ROT,p0)
return ( B )
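#
# A small hedged sanity sketch (assumed usage): rotate and shift a random point cloud and
# check that KabschAlignment maps the original back onto it to numerical precision.
def _example_kabsch_alignment () :
    P     = np.random.rand ( 20 , 3 )
    theta = 0.25 * np.pi
    R     = np.array ( [ [ np.cos(theta) , -np.sin(theta) , 0. ] ,
                         [ np.sin(theta) ,  np.cos(theta) , 0. ] ,
                         [ 0.            ,  0.            , 1. ] ] )
    Q     = np.dot ( P , R.T ) + 1.0
    B     = KabschAlignment ( P , Q )
    return ( np.allclose ( B , Q , atol=1.0E-8 ) ) # EXPECTED TO BE TRUE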
def WeightsAndScoresOf( P , bFA=False ) :
p0 = np.mean( P,0 )
U, S, VT = np.linalg.svd( P-p0 , full_matrices=False )
weights = U
if bFA :
scores = np.dot(S,VT).T
return ( weights , scores )
scores = VT.T
return ( weights , scores )
def ShapeAlignment( P, Q ,
bReturnTransform = False ,
bShiftModel = True ,
bUnrestricted = False ) :
#
# [*] C++ VERSION: https://github.com/richardtjornhammar/RichTools/blob/master/src/richfit.cc
    # SEARCH FOR SHAPE FIT TO FIND SIMILAR CODE IN THE RICHFIT REPO
#
description = """
    A NAIVE SHAPE FIT PROCEDURE; MORE SOPHISTICATED
    VERSIONS WRITTEN IN C++ CAN BE FOUND IN MY C++ REPO [*].
    HERE WE WORK UNDER THE ASSUMPTION THAT Q IS THE MODEL,
    SO THAT WE SHOULD HAVE SIZE Q < SIZE P, WITH UNKNOWN
    ORDERING, AND THAT THEY SHARE A COMMON SECOND DIMENSION.
    IN THIS ROUTINE THE COARSE GRAINED DATA ( THE MODEL ) IS
    MOVED TO FIT THE FINE GRAINED DATA ( THE DATA )
"""
N,DIM = np.shape( P )
M,DIM = np.shape( Q )
W = (N<M)*N+(N>=M)*M
if (DIM>W or N<M) and not bUnrestricted :
print ( 'MALFORMED PROBLEM' )
print ( description )
exit ( 1 )
q0 , p0 = np.mean(Q,0) , np.mean(P,0)
cQ , cP = Q - q0 , P - p0
sQ = np.dot( cQ.T,cQ )
sP = np.dot( cP.T,cP )
H = np.dot(sP.T,sQ)
I = np.eye( DIM )
U, S, VT = np.linalg.svd( H, full_matrices=False )
Ut = np.dot( VT.T,U.T )
I[DIM-1,DIM-1] = 2*(np.linalg.det(Ut) > 0)-1
ROT = np.dot( VT.T,np.dot(I,U.T) )
if bReturnTransform :
return ( ROT,q0,p0 )
if bShiftModel :# SHIFT THE COARSE GRAINED DATA
B = np.dot(ROT,Q.T).T +p0 - np.dot(ROT,q0)
else : # SHIFT THE FINE GRAINED DATA
B = np.dot(ROT,P.T).T +q0 - np.dot(ROT,p0)
return ( B )
def low_missing_value_imputation ( fdf , fraction = 0.9 , absolute = True ) :
# THIS SVD BASED IMPUTATION METHOD WAS FIRST WRITTEN FOR THE RANKOR PACKAGE
# ORIGINAL CODE IN https://github.com/richardtjornhammar/rankor/blob/master/src/rankor/imputation.py
#
import numpy as np
#
# fdf is a dataframe with NaN values
# fraction is the fraction of information that should be kept
# absolute is used if the data is positive
#
V = fdf.apply(pd.to_numeric).fillna(0).values
u,s,vt = np.linalg.svd(V,full_matrices=False)
s = np.array( [ s[i_] if i_<np.floor(len(s)*fraction) else 0 for i_ in range(len(s)) ] )
nan_values = np.dot(np.dot(u,np.diag(s)),vt)
if absolute :
nan_values = np.abs(nan_values)
#
# THIS CAN BE DONE BETTER
for j in range(len(fdf.columns.values)):
for i in range(len(fdf.index.values)):
if 'nan' in str(fdf.iloc[i,j]).lower():
fdf.iloc[i,j] = nan_values[i,j]
return ( fdf )
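#
# A hedged usage sketch of the SVD imputation above: knock two values out of a low rank
# frame and let the truncated reconstruction fill them back in (synthetic numbers only).
def _example_low_missing_value_imputation () :
    base = np.outer ( np.arange( 1. , 7. ) , np.arange( 1. , 5. ) ) # A RANK ONE 6x4 MATRIX
    fdf  = pd.DataFrame ( base )
    fdf.iloc[0,0] = np.nan ; fdf.iloc[3,2] = np.nan
    imputed = low_missing_value_imputation ( fdf , fraction = 0.9 , absolute = True )
    return ( imputed )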
if __name__ == '__main__' :
#
# IF YOU REQUIRE THE DATA THEN LOOK IN :
# https://github.com/richardtjornhammar/RichTools
# WHERE YOU CAN FIND THE FILES USED HERE
#
if True :
colors = {'H':'#777777','C':'#00FF00','N':'#FF00FF','O':'#FF0000','P':'#FAFAFA'}
Q = read_xyz( name='data/naj.xyz' , header=2 , sep=' ' )
if False : # TEST KABSCH ALGORITHM
P = Q .copy()
Q = Q * -1
Q = Q + np.random.rand(Q.size).reshape(np.shape(Q.values))
P_ , Q_ = P.copy() , Q.copy()
P = P_.values
Q = Q_.values
B = KabschAlignment( P,Q )
B = pd.DataFrame( B , index = P_.index.values ); print( pd.concat([Q,B],1))
if True : # TEST MY SHAPE ALGORITHM
P = read_xyz ( name='data/cluster0.xyz' , header=2 , sep='\t' )
P_ , Q_= P.values,Q.values
B_ = ShapeAlignment( P_,Q_ )
B = pd.DataFrame(B_, index=Q.index,columns=Q.columns)
pd.concat([B,P],0).to_csv('data/shifted.xyz','\t')
"""
Copyright 2019 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
from impetuous.quantification import group_significance
from impetuous.convert import *
def pathway_frame_from_file ( filename ,
delimiter = '\t' , item_sep = ',' ) :
pdf = None
with open( filename,'r' ) as input :
for line in input :
lspl = line.replace('\n','').split(delimiter)
analytes_ = lspl[2:]
desc = lspl[1]
iden = lspl[0]
ps = pd.Series( [desc , item_sep.join(analytes_) ] ,
name = iden , index = ['description','analytes'] )
pdf = pd.concat([pdf,pd.DataFrame(ps).T])
return ( pdf )
def create_dag_representation_df ( pathway_file = '../data/GROUPDEFINITIONS.gmt' ,
pcfile = '../data/PCLIST.txt'
) :
pc_list_file = pcfile
tree , ance , desc = parent_child_to_dag ( pc_list_file )
pdf = make_pathway_ancestor_data_frame ( ance )
pdf_ = pathway_frame_from_file( pathway_file )
pdf.index = [v.replace(' ','') for v in pdf.index.values]
pdf_.index= [v.replace(' ','') for v in pdf_.index.values]
dag_df = pd.concat([pdf.T,pdf_.T]).T
return ( dag_df , tree )
def HierarchalEnrichment (
analyte_df , dag_df , dag_level_label = 'DAG,l' ,
ancestors_id_label = 'aid' , id_name = None , threshold = 0.05 ,
p_label = 'C(Status),p', analyte_name_label = 'analytes' ,
item_delimiter = ',' , alexa_elim=False , alternative = 'two-sided'
) :
#
# NEEDS AN ANALYTE SIGNIFICANCE FRAME:
# INCLUDING P VALUES OF ANALYTES
# DAG GRAPH DESCRIPTION FRAME:
    # INCLUDING NODE ID, NODE ANALYTES FIELD (SEPARATED BY THE ITEM DELIMITER)
    # INCLUDING ANCESTORS FIELD (SEPARATED BY THE ITEM DELIMITER)
# DAG LEVEL OF EACH NODE
tolerance = threshold
df = dag_df ; dag_depth = np.max( df[dag_level_label].values )
AllAnalytes = set( analyte_df.index.values ) ; nidx = len( AllAnalytes )
SigAnalytes = set( analyte_df.iloc[ (analyte_df.loc[:,p_label].values < tolerance), : ].index.values )
if len( AllAnalytes ) == len( SigAnalytes ) :
print ( 'THIS STATISTICAL TEST WILL BE NONSENSE' )
print ( 'TRY A DIFFERENT THRESHOLD' )
marked_analytes = {} ; used_analytes = {} ; node_sig = {}
for d in range( dag_depth, 0, -1 ) :
# ROOT IS NOT INCLUDED
filter_ = df [ dag_level_label ] == d
nodes = df.iloc[ [i for i in np.where(filter_)[ 0 ]] ,:].index.values
for node in nodes :
if 'nan' in str(df.loc[node,analyte_name_label]).lower() :
continue
analytes_ = df.loc[node,analyte_name_label].replace('\n','').replace(' ','').split(item_delimiter)
try :
group = analyte_df.loc[[a for a in analytes_ if a in AllAnalytes] ].dropna( axis=0, how='any', thresh=analyte_df.shape[1]/2 ).drop_duplicates()
except KeyError as e :
continue
if node in marked_analytes :
unused_group = group.loc[ list( set(group.index.values)-marked_analytes[node] ) ]
group = unused_group
L_ = len( group ) ; str_analytes=','.join(group.index.values)
if L_ > 0 :
used_analytes[node] = ','.join( group.index.values )
pv,odds = group_significance( group , AllAnalytes=AllAnalytes, SigAnalytes=SigAnalytes , tolerance = threshold , alternative=alternative )
node_sig[node] = pv ; marked_ = set( group.index.values )
ancestors = df.loc[node,ancestors_id_label].replace('\n','').replace(' ','').split(item_delimiter)
if alexa_elim and pv > threshold : # USE ALEXAS ELIM ALGORITHM : IS NOT DEFAULT
continue
for u in ancestors :
if u in marked_analytes :
us = marked_analytes[u]
marked_analytes[u] = us | marked_
else :
marked_analytes[u] = marked_
df['Hierarchal,p'] = [ node_sig[idx] if idx in node_sig else 1. for idx in df.index.values ]
df['Included analytes,ids'] = [ used_analytes[idx] if idx in used_analytes else '' for idx in df.index.values ]
df = df.dropna()
return ( df )
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from impetuous.clustering import *
from impetuous.quantification import *
import sys
if __name__ == '__main__' :
inventors__ = "Richard Tjörnhammar and Edward Tjörnhammar"
"""
Copyright 2019 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
import re
from impetuous.convert import read_conversions, read_gene_ensemble_conversion, create_synonyms
def find_compartment( istr ) :
return ( re.findall( r'\[(.*?)\]', istr ) )
class Group( object ) :
def get_next(self):
pass
## PATHWAY SECTION
class GenericPathway( Group ) :
def __init__( self , path , gene_name_start = "ENSG0" , gene_mapping = None, # GENE ID MAPPING
is_own_pathway = False, list_like = False , add_pathway_prefix='',
gene_pos=0 , pathway_pos=1 , pathway_desc_pos=3, seperator='\t' ) :
self.file = path
self.prefered_genes = gene_name_start
self.pathways , self.pathway_names = {},{}
self.sep = seperator
self.is_own_pathway = is_own_pathway
self.gene_mapping = gene_mapping
self.gene_pos = gene_pos
self.pathway_pos = pathway_pos
self.pathway_desc_pos = pathway_desc_pos
self.list_like = list_like
self.add_pathway_prefix = add_pathway_prefix
self.replace_pair = None
self.internal_table = None
self.generator = None
self.active_read = False
self.restrict_id_to = None
if 'str' in str(type(self.file)):
self.read_and_parse()
else :
self.parse_df()
def add_pathway_synonyms ( self , synonym_dict , prefix='' ) :
for pathway,pathway_name,genes in self.get_generator() :
synonyms = []
for g in genes :
if g in synonym_dict or prefix+g in synonym_dict :
k = prefix * ( not g in synonym_dict ) + g
sdg = synonym_dict[k]
if 'list' in str(type(sdg)) :
[ synonyms.append(s) for s in sdg ]
if 'str' in str(type(sdg)) :
synonyms.append(sdg)
[ self.pathways[pathway].append(s) for s in synonyms ]
def make_gmt_pathway_file ( self , filename , verbose=False , delimiter='\t', gene_mapping=None ):
#if 'None' in str(type( self.generator )) :
# self.generator = self.get_generator()
if not gene_mapping is None:
self.gene_mapping = gene_mapping
if 'str' in str(type(filename)) :
if 'str' in str( type( filename ) ) :
with open( filename, 'w' ) as o_file :
for pathway, pathway_name, genes in self.get_generator() :
row = list() ; row . append ( pathway )
row . append ( pathway_name )
genes_loc = genes
if 'dict' in str(type(self.gene_mapping)) :
genes_loc = [ self.gene_mapping[gene] if gene in self.gene_mapping.keys() else gene for gene in genes ]
[ row.append ( gene ) for gene in list(set(genes_loc)) ]
row_str = delimiter.join(row)
print(row_str)
o_file . write( row_str+'\n' )
else :
print ( 'MUST SUPPLY A FILENAME' )
def make_internal_table(self, verbose=False, delimiter='\t', output_file=None ) :
self.internal_table = list()
generator = self.get_generator( )
for pathway, pathway_name, genes in generator :
row = list() ; row . append ( pathway )
row .append ( pathway_name )
[ row .append ( gene ) for gene in genes ]
row_str = delimiter.join(row)
self.internal_table.append( row )
def parse_df(self):
if not self.is_own_pathway:
print('ERROR: OPERATION NOT SUPPORTED')
exit(1)
sfcv = set( self.file.columns.values )
if 'gene' in sfcv and 'pathway' in sfcv and 'description' in sfcv:
print ( self.file.columns )
else:
genes = self.file.index.values
for gene in genes :
pathway = self.add_pathway_prefix + gene
self.pathways[pathway] = [gene]
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys() :
self.pathway_names[pathway] = self.gene_mapping[gene]
else :
self.pathway_names[pathway] = gene
def get_generator_from_df(self):
self.parse_df()
for key in self.pathways.keys():
yield(key,self.pathway_names[key],self.pathways[key])
def read_and_parse(self):
with open(self.file) as input:
pathway, pathway_name, genes = "", "", []
for line in input:
lspl = line.split('\t')
if not 'None' in str(type(self.replace_pair)):
pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] ).replace( self.replace_pair[0],self.replace_pair[1] )
else:
pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] )
pathway_name = lspl[self.pathway_desc_pos]
if self.list_like :
# LIST LIKE PATHWAY INVENTORY CANNOT HAVE SELF MAPPING
# HERE WE ASSUME ALL GENES FOLLOW THE FIRST GENE_POS
genes = [ lspl[ir].replace('\n','') for ir in range(self.gene_pos,len(lspl)) ]
if not 'None' in str(type(self.gene_mapping)) :
renamed_genes = [ self.gene_mapping[gene] if gene in self.gene_mapping.keys() else gene for gene in genes ]
genes = renamed_genes
if pathway in self.pathways :
[ self.pathways[pathway].append(gene) for gene in genes ]
else :
self.pathways[pathway] = genes
self.pathway_names[pathway] = pathway_name
else :
if not line.startswith(self.prefered_genes) or len(line)==0:
continue
gene = lspl[ self.gene_pos ]
if self.is_own_pathway :
if gene in self.pathways :
continue;
else:
pway=self.add_pathway_prefix + gene
self.pathways[pway] = [gene]
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys():
self.pathway_names[pway] = self.gene_mapping[gene]
else:
self.pathway_names[pway] = gene
else :
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys():
gene = self.gene_mapping[gene]
if pathway in self.pathways:
self.pathways[pathway].append(gene)
else:
self.pathways[pathway] = [gene]
self.pathway_names[pathway] = pathway_name
def dump_pathways(self):
return ( self.pathways,self.pathway_names )
def get_generator( self, verbose=False ):
if self.active_read :
if not self.file is None :
if 'str' in str(type(self.file)):
self.read_and_parse()
else:
self.parse_df()
if verbose :
print( self.dump_pathways() )
for key in self.pathways.keys():
yield( key , self.pathway_names[key] , self.pathways[key] )
class Reactome( GenericPathway ) :
def __init__( self , path , gene_name_start = 'ENSG0' ,pathway_desc_pos=3,
gene_mapping = None, lexical_restrictions = None, # GENE ID MAPPING
is_own_pathway = False, restrict_id_to = None ) :
self.file = path
self.prefered_genes = gene_name_start
self.pathways , self.pathway_names ,self.pathway_compartments = {},{},{}
self.is_own_pathway = is_own_pathway
self.gene_mapping = gene_mapping
self.gene_pos = 0
self.pathway_pos = 1
self.pathway_desc_pos = pathway_desc_pos
self.list_like = False
self.add_pathway_prefix = ''
self.replace_pair = None
self.lexical_restrictions = lexical_restrictions
self.lexical_restriction_category_pos=None
self.skipstr = None
self.pathway_tag = None
self.restrict_id_to = restrict_id_to
self.active_read = False
if 'str' in str(type(self.file)):
self .read_and_parse(keep_str='ENSG0')
else:
self .parse_df()
def read_and_parse( self , keep_str=None ) :
with open( self.file ) as input :
pathway, pathway_name, genes = "", "", []
for line in input :
if not self.skipstr is None :
if line.startswith(self.skipstr):
continue
if not keep_str is None :
if not line.startswith(keep_str):
continue
lspl = line.split('\t')
if ('[' in line) and (']' in line) :
compartment = find_compartment( line )[0]
else :
compartment = ''
if not 'None' in str(type(self.replace_pair)) :
pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] ).replace( self.replace_pair[0],self.replace_pair[1] )
else :
pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] )
if not self.restrict_id_to is None :
if not pathway in self.restrict_id_to:
continue
pathway_name = lspl[ self.pathway_desc_pos ]
if not self.lexical_restrictions is None:
if not np.sum( [ int(lr in pathway_name) for lr in self.lexical_restrictions] )>0 : #or len(compartment)<4
continue
if self.lexical_restriction_category_pos is None :
lex_restrict = pathway_name
else :
lex_restrict = lspl[ self.lexical_restriction_category_pos ]
if self.list_like :
# LIST LIKE PATHWAY INVENTORY CANNOT HAVE SELF MAPPING
# HERE WE ASSUME ALL GENES FOLLOW THE FIRST GENE_POS
genes = [ lspl[ir].replace('\n','') for ir in range(self.gene_pos,len(lspl)) ]
if not 'None' in str(type(self.gene_mapping)) :
renamed_genes = [ self.gene_mapping[gene] if gene in self.gene_mapping.keys() else gene for gene in genes ]
genes = renamed_genes
if pathway in self.pathways :
[ self.pathways[pathway].append(gene) for gene in genes ]
else :
self.pathways[pathway] = genes
self.pathway_names[pathway] = pathway_name
self.pathway_compartments[pathway] = compartment
else :
if not line.startswith(self.prefered_genes) or len(line)==0:
continue
gene = lspl[ self.gene_pos ]
if self.is_own_pathway :
if gene in self.pathways :
continue;
else:
pway = self.add_pathway_prefix + gene
self.pathways[pway] = [gene]
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys():
self.pathway_names[pway] = self.gene_mapping[gene]
else:
self.pathway_names[pway] = gene
else :
if not self.pathway_tag is None:
if not self.pathway_tag in pathway:
continue
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys():
gene = self.gene_mapping[gene]
if pathway in self.pathways:
self.pathways[pathway].append(gene)
else :
if True : # self.lexical_restrictions is None :
self.pathways[pathway] = [gene]
self.pathway_names[pathway] = pathway_name
else :
if len(set(self.lexical_restrictions)-set(lex_restrict.split()))<len(self.lexical_restrictions):
if len( set(self.lexical_restrictions) & set(lex_restrict.split()) )>0:
self.pathways[pathway] = [gene]
self.pathway_names[pathway] = pathway_name
def add_pathway_synonyms( self , synonym_dict, prefix='' ) :
for pathway, pathway_name, genes in self.get_generator() :
synonyms = []
for g in genes:
if g in synonym_dict or prefix+g in synonym_dict :
k = prefix * ( not g in synonym_dict ) + g
sdg = synonym_dict[ k ]
if 'list' in str(type(sdg)) :
[ synonyms.append(s) for s in sdg ]
if 'str' in str(type(sdg)) :
synonyms.append(sdg)
[ self.pathways[pathway].append(s) for s in synonyms ]
def read_extra_information ( self , file_name , valpos = 0 , keypos = 1 ,
add_value_prefix = '' , required_content = 'HSA' ,
add_desc_pos = None , comp_pos = None ) :
with open ( file_name ) as input :
pathway, genes = "", []
for line in input :
lspl = line.split( '\t' )
pw = lspl[ keypos ]
if required_content in pw :
gn = add_value_prefix + lspl[ valpos ]
if not self.restrict_id_to is None :
if not pw in self.restrict_id_to :
continue
if pw in self.pathways :
self.pathways[pw] .append(gn)
else :
self.pathways[pw] = [gn]
self.pathway_names[ pw ] = ''
if not add_desc_pos is None :
if lspl[add_desc_pos] not in self.pathway_names[pw] :
self.pathway_names[pw] += lspl[add_desc_pos] + ', '
if not comp_pos is None :
if self.pathway_compartments is None :
self.pathway_compartments = {}
if pw not in self.pathway_compartments :
self.pathway_compartments[pw] = [ ''.join(find_compartment(lspl[comp_pos])) ]
else :
self.pathway_compartments[pw] .append( ''.join(find_compartment(lspl[comp_pos])) )
def add_extra_information_from_dictionary( self , dictionary, map_onto_genes=True ):
for pathway in self.pathways :
genes = self.pathways[pathway]
add_these = []
[ [ add_these.append(v) for v in dictionary[g] ] for g in genes if g in dictionary.keys() ]
if len(add_these)>0:
[ self.pathways[pathway].append(a) for a in add_these]
def print_generator( generator , show_max_nr=3 , find_str=None ):
for pathway,pathway_name,genes in generator:
if not find_str is None:
if not find_str in pathway:
continue
print('Pathway: ' , pathway )
print('Pathway name: ', pathway_name )
print('Gene amount : ', len(genes), ' \t Gene name of first: ', genes[0] )
print('Gene dump : ', genes )
show_max_nr-=1
if show_max_nr == 0:
break;
def create_listdictionary_from_file( filename , delimiter = '\t' ,add_key_prefix='' ):
wanted_dictionary = dict()
with open( filename ) as input :
value, keyval, descriptor = "", "", []
for line in input:
all_vals = line.replace('\n','').replace(' ','').split( delimiter )
keyv = add_key_prefix+all_vals[0]
valv = all_vals[1:]
wanted_dictionary[keyv] = valv
return ( wanted_dictionary )
def flatten_generator(pathway_generator_object):
for pathway,genes in pathway_generator_object.pathways.items() :
ngenes = []
[ ngenes.append(g) if 'str' in str(type(g)) else [ ngenes.append(q) for q in g ] for g in genes ]
pathway_generator_object.pathways[pathway] = list(set(ngenes))
if pathway in pathway_generator_object.pathway_compartments:
pathway_generator_object.pathway_names[pathway] = '[' + \
','.join(list(set( pathway_generator_object.pathway_compartments[pathway] )) ) + \
'] ' + pathway_generator_object.pathway_names[pathway]
if __name__ == '__main__' :
print('ADD BACK REACTOME TEST')
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import numpy as np
import pandas as pd
def hcurve ( x ) :
return ( x )
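#
# D SETS THE DEFAULT NEIGHBOUR COUNT K FOR A GIVEN DIMENSION d (SEE cluster_probability_of_x BELOW) :
# D(0) = 0 AND D(d) = D(d-1) + 2**d , I.E. D(d) = 2**(d+1) - 2 ,
# WHILE THE LINEAR VARIANT SIMPLY USES floor(3.5*d)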
def D ( d , bLinear=False ) :
if bLinear :
return ( int(np.floor(3.5*d)) )
if d==0 :
return ( 0 )
else :
return ( D(d-1) + 2**d )
def cluster_probability_of_x ( X ,
evidence , labels , centroids ,
K = None , bLinear = False ) :
ispanda = lambda P: 'pandas' in str(type(P)).lower()
BustedPanda = lambda R : R.values if ispanda(R) else R
desc_ = """
DOING THE SIMPLE PROBABILISTIC MODELLING FOR THE DATA
    YOU CAN USE THIS TO DO VORONOI TESSELLATION WITH K = 1 IN 2D
X IS A LIST OF COORDINATE POINTS TO EVALUATE THE FUNCTION ON
EVIDENCE IS A LIST OF ACTUAL EXPERIMENTAL DATA COORDINATES
LABELS ARE THE IMPUTED CLUSTER LABELS OF THE EXPERIMENTAL DATA POINTS
CENTROIDS ARE THE CENTROID COORDINATES
K IS THE NEIGHBOUR DIMENSION
BLINEAR SPECIFIES THAT ...
labels , centroids = impq.knn_clustering_alignment ( P , Q )
evidence = P.values
THIS IS NOT THE SAIGA YOU ARE LOOKING FOR -RICHARD TJÖRNHAMMAR
"""
ev_ = BustedPanda ( evidence )
N , DIM = np.shape( ev_ )
if K is None :
K = D ( DIM , bLinear )
#
# INPUT TYPE CHECKING
if True :
bFailure = False
if not ( 'array' in str(type(labels)) and len(np.shape(labels)) == 2 ) :
print("THE LABELS SHOULD BE 2D ARRAY OF DIMENSION N,1 "); bFailure=True
if not ( 'array' in str(type(ev_)) and len(np.shape(ev_)) == 2 ) :
print("EV_ SHOULD BE 2D ARRAY OF DIMENSION N,DIM "); bFailure=True
if not ( 'array' in str(type(centroids)) and len(np.shape(centroids)) == 2 ) :
print("CENTROIDS SHOULD BE 2D ARRAY OF DIMENSION M,DIM "); bFailure=True
if not ( 'array' in str(type(X)) and len(np.shape(X)) == 2 ) :
print("X SHOULD BE 2D ARRAY OF DIMENSION P,DIM "); bFailure=True
if bFailure :
print( desc_ )
exit(1)
#
all_cluster_names = sorted(list(set( [ l[0] for l in labels] )))
print ( all_cluster_names )
#
Rdf = None
for x in X :
S = sorted([ (s,i) for (s,i) in zip( np.sum( (x - ev_)**2 , 1 ) , range(len(ev_)) ) ])[:K]
present_clusters = [ labels[s[1]][0] for s in S ]
pY = []
for yi in all_cluster_names:
pY.append( np.sum(yi==present_clusters)/K )
tdf = pd.DataFrame( [[*x,*pY]] ,
columns = [ *['d'+str(i) for i in range(len(x))] ,
*all_cluster_names ] )
if Rdf is None :
Rdf = tdf
else :
Rdf = pd.concat([Rdf,tdf])
Rdf.index = range(len(X))
probability_df = Rdf.copy()
return ( probability_df )
import impetuous.quantification as impq
if __name__ == '__main__' :
#
# IF YOU REQUIRE THE DATA THEN LOOK IN :
# https://github.com/richardtjornhammar/RichTools
# WHERE YOU CAN FIND THE FILES USED HERE
#
if True :
print( D(1),D(2),D(3),D(4) )
print( D(1,True),D(2,True),D(3,True),D(4,True) )
colors = {'H':'#777777','C':'#00FF00','N':'#FF00FF','O':'#FF0000','P':'#FAFAFA'}
Q = read_xyz( name='data/naj.xyz' , header=2 , sep=' ' )
        P = read_xyz( name='data/cluster0.xyz' , header=2 , sep='\t' ) # IF IT FAILS THEN CHECK THE FORMATTING OF THIS FILE
if True :
labels , centroids = impq.knn_clustering_alignment ( P , Q )
pd.DataFrame( labels ) .to_csv( 'labels.csv' ,'\t' )
pd.DataFrame( centroids).to_csv( 'centroids.csv','\t' )
else :
labels = pd.read_csv('labels.csv','\t',index_col=0).values
centroids = pd.read_csv('centroids.csv','\t',index_col=0).values
p = cluster_probability_of_x ( X = np.array( [ [-20.5, 24.6 , 84] , [10,10,10] ] ) ,
evidence = P.values ,
labels = labels.reshape(-1,1) ,
centroids = centroids ,
K = None )
print ( p )
This diff has been collapsed.
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
#
# MOVED OVER FROM RANKOR (reducer.py) AS OF COMMIT :
# https://github.com/richardtjornhammar/rankor/commit/f04a83091a92130b9b73439b4ad65b5be3056cf9
#
import pandas as pd
import numpy as np
from scipy.stats import rankdata
from impetuous.quantification import qvalues, permuter
#
# REGULAR CONTRAST
contrast = lambda A,B : ( A-B )/( A+B )
#
# OTHER
e_flatness = lambda x : np.exp(np.mean(np.log(x),0))/np.mean(x,0)
e_contrast = lambda x : 1 - e_flatness(x)
pi0 = lambda pvs : 1.
def padded_rolling_average( tv , tau ) :
# AS OF THIS PANDAS VERSION ( 1.1.0 )
# WINDOW CALCULATION WAS NOT PADDED SO
# THIS IS A NUMPY ONLY IMPLEMENTATION
if tau==1 :
return ( tv )
if len(tv)<tau :
return ( [ np.mean(v_) for v_ in tv ] )
centered = lambda x:(x[0],x[1]) ; N=len(tv);
w = int(np.floor(np.abs(tau)*0.5)) ;
jid = lambda i,w,N:[int((i-w)>0)*((i-w)%N),int(i+w<N)*((i+w)%N)+int(i+w>=N)*(N-1)]
idx = [ centered( jid(i,w,N) ) for i in range(N) ]
mvalues = [ np.mean(tv[i[0]:i[1]]) for i in idx ]
return ( mvalues )
def svd_reduced_mean ( x,axis=0,keep=[0] ) :
if True :
sk = set ( keep )
if len ( np.shape(x) ) > 1 :
u , s , vt = np .linalg .svd( x , full_matrices=False )
xred = np.mean( np.dot(u*[s[i_] if i_ in sk else 0 for i_ in range(len(s))],vt) , axis)
if 'pandas' in str(type(x)) :
if not 'series' in str(type(x)) :
xname = x.index.values[0]
return ( pd.DataFrame( [xred] , index=[xname] , columns=x.columns ) )
else :
xname = x.name
return ( pd.Series( xred , name=xname , index=x.columns ) )
else :
return ( xred )
return ( x )
from sklearn.decomposition import PCA
dimred = PCA ( n_components = 1 )
def pca_reduced_mean( x ) :
if True :
if len ( np.shape(x) ) > 1 :
Xnew = dimred.fit_transform( x.T )
xred = Xnew . T [0] + np.mean(np.mean(x))
if 'pandas' in str(type(x)) :
if not 'series' in str(type(x)) :
xname = x.index.values[0]
return ( pd.DataFrame( [xred] , index=[xname] , columns=x.columns ) )
else :
xname = x.name
return ( pd.Series( xred , name=xname , index=x.columns ) )
return ( x )
def reduction ( a , power , centered=-1 ) :
if centered>0 :
a = ( a.T-np.mean(a,1) ).T
return( np.linalg.svd ( a**power , full_matrices=False ) )
def hyper_params ( df_ , label = 'generic' , sep = ',' , power=1., centered=-1 ):
#
idx_ = df_.index.values
N_s = len ( df_.columns )
u,s,vt = reduction( df_.values , power , centered=centered )
rdf_ = pd.Series ( np.sum(u**2,1) , index=idx_ , name = label+sep+"u" )
rdf_ = pd.concat ( [ pd.DataFrame(rdf_) ,
pd.DataFrame( pd.Series( np.mean( df_.values,1 ) ,
index = idx_ , name=label+sep+"m") ) ] , axis=1 )
w_ = rdf_ .loc[ :,label+sep+"u" ].values
r_ = rankdata ( [ w for w in w_ ] ,'average' )
N = len ( r_ )
#
df0_ = pd.DataFrame( [ [a for a in range(N)],w_,r_ ],index=['idx','w_','r_'], columns=idx_ ).T
#
from scipy.special import erf as erf_
loc_pval = lambda X , mean , stdev : [ 1. - 0.5*( 1. + erf_( ( x - mean )/stdev/np.sqrt(2.) ) ) for x in X ]
lvs = np.log( df0_.loc[ :,'w_'].values )
#
return ( np.mean(lvs) , np.std(lvs) )
def hyper_rdf ( df_ , label = 'generic' , sep = ',' , power=1. ,
diagnostic_output = False , MEAN=None , STD=None , centered=-1 ) :
#
idx_= df_.index.values
N_s = len ( df_.columns )
u,s,vt = reduction ( df_.values , power , centered=centered )
rdf_ = pd.Series ( np.sum( ( u )**2,1 ) , index=idx_ , name = label+sep+"u" )
rdf_ = pd.concat ( [ pd.DataFrame( rdf_ ) ,
pd.DataFrame( pd.Series( np.mean( df_.values,1 ) ,
index = idx_ , name=label+sep+"m") ) ] , axis=1 )
w_ = rdf_ .loc[ :,label+sep+"u" ].values
r_ = rankdata ( [ w for w in w_] ,'average' )
N = len ( r_ )
#
    # HERE WE CONSTRUCT A DIAGNOSTICS AND INTERMEDIATE CALCULATION
# DATAFRAME FOR EASIER DEBUGGING AND INSPECTION
df0_ = pd.DataFrame( [ [a for a in range(N)],w_,r_ ],index=['idx','w_','r_'], columns=idx_ )
df0_ = df0_.T.sort_values ( by='r_' )
df0_ .loc[ : , 'd_' ] = [ v for v in ( df0_.loc[:, 'w_' ] * df0_.loc[:,'r_'] ) ]
df0_ .loc[ : , 'da_' ] = np.cumsum ( df0_ .loc[ : , 'd_' ].values )
#
# HOW MANY EQUALLY OR MORE EXTREME VALUES ARE THERE? ( RANK BASED )
df0_ .loc[ : , 'dt_' ] = np.cumsum ( df0_ .loc[ : , 'd_' ].values[::-1] )[::-1]
df0_ .loc[ : , 'rank_ps' ] = df0_ .loc[ :,'dt_' ] / np.sum( df0_ .loc[ :,'d_' ] )
#
# DRAW HISTOGRAM TO SEE DISTRIBUTION OF THE DISTANCES
from scipy.special import erf as erf_
loc_pval = lambda X , mean , stdev : [ 1. - 0.5*( 1. + erf_( ( x - mean )/stdev/np.sqrt(2.) ) ) for x in X ]
lvs = np.log( df0_.loc[ :,'w_'].values )
if MEAN is None or STD is None:
if len(lvs)>3 :
ps = loc_pval( lvs , np.mean( lvs ) , np.std( lvs ) )
else :
ps = df0_ .loc[ : , 'rank_ps' ]
else :
ps = loc_pval( lvs , MEAN , STD )
#
if diagnostic_output :
import scipy.stats as scs
NB = 100
lv = np.log( rdf_.loc[:,[label+sep+"u"]].values )
y , x = np.histogram( lv , bins=NB , density=True )
skw = scs.skew(lv)[0]
kur = scs.kurtosis(lv)[0]
shape_stats = "kurtosis: " + "{:.2f} ".format( kur ) + "skewness: "+ "{:.2f}".format( skw )
locd = lambda x,M,s : (1./s/np.sqrt(2.*np.pi))*np.exp(-0.5*((x-M)/s)**2 )
lin_x = 0.5 * ( x[1:] + x[:-1] )
his_y = y
rsy = sorted( [ (y_,x_) for (y_,x_) in zip(y,lin_x) ] )[::-1]
hm , hs = np.mean([rsy[i][1] for i in range(5)]) , np.mean([rsy[i][0] for i in range(5)])
h_mod_y = locd( lin_x , hm , 1.0/(hs*np.sqrt(2.*np.pi)) )
d_mod_y = locd( lv , np.mean(lv) , np.std(lv) )
rem_y = [ (y_-m_) for (y_,m_) in zip(y, locd(0.5*(x[1:]+x[:-1]),np.mean(lv),np.std(lv))) ]
prc_y = [ 100.*np.abs( contrast(y_,m_) ) for (y_,m_) in zip(y, locd(0.5*(x[1:]+x[:-1]),np.mean(lv),np.std(lv))) ]
RMSD = np.sqrt(np.sum([ ry**2 for ry in rem_y ]))
PMAE = np.mean(prc_y)
#
df0_ .loc[ : , 'pvalues' ] = ps
#
# ASSIGN THE P VALUES
rdf_.loc[df0_.index.values,label+sep+"p"] = df0_.loc[:,'pvalues']
rdf_.loc[df0_.index.values,label+sep+"q"] = [ qvs[0] for qvs in qvalues( df0_.loc[:,'pvalues'].values , pi0 = pi0(df0_.loc[:,'pvalues'].values) ) ]
rdf_.loc[df0_.index.values,label+sep+"r"] = df0_ .loc[ : , 'rank_ps' ]
#
# AND RETURN THE RESULTS
if diagnostic_output :
return ( rdf_ , RMSD , PMAE , kur , skw , rem_y )
else :
return ( rdf_ )
if __name__ == '__main__' :
#
print ( 'REDUCER :: TESTS ' )
#
a = 2*np.random.rand(10)
b = 4*np.random.rand(10)
X = [ [*(a[:5]+1),*a[5:]],[*(b[:5]+3),*(b[5:])] ]
Xdf = pd.DataFrame( X , columns=['a','b','s','c','d','e','f','g','h','i'] , index=['i0','i1'])
#
print ( Xdf )
print ( svd_reduced_mean ( X ) )
print ( svd_reduced_mean ( Xdf ) )
v0 = [1.9,1.8,2.1,1.1,8.,1.2,2.2,3.5,2.0,2.0,3.1,2.1,2.9]
a2 = np.array([[2,2,2,1,1,1,2,3,2,2,3,2,3],v0])
NW = 100
if False:
for i in range(NW):
for w in range(1,i-1):
dat = np.random.rand(i)
pra = padded_rolling_average( dat,tau=w )
str_pra = ''.join([ str(p) for p in pra ])
if 'nan' in str_pra:
print ( pra,len(pra),'[',i,w ,']')
else:
if not len(pra)==len(dat):
print ( len(pra)==len(dat),'[',i,w ,']',len(pra),len(dat) )
print ( padded_rolling_average( np.array( svd_reduced_mean( Xdf ).values ).reshape(-1,1) ,tau=4 ) )
print ( padded_rolling_average( np.array( svd_reduced_mean( Xdf ).values ).reshape(1,-1) ,tau=4 ) )
print ( padded_rolling_average( np.array( svd_reduced_mean( Xdf ).values ) ,tau=4 ) )
print ( padded_rolling_average( v0,tau=4 ) )
print ( v0 )
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
from scipy.stats import rankdata
beta2M = lambda b : np.log2( b/(1.-b) )
M2beta = lambda M : 2**M/(2**M+1)
#
# FRACTIONAL RANKS ON [0,1] ARE RECAST TO +- INFINITY
map_df_to_spectral_order_domain = lambda df: \
df .apply ( lambda x:(rankdata(x,'average')-0.5)/len(x) ) \
.apply ( lambda B:[ beta2M(b) for b in B] )
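#
# FOR INTUITION ( A SMALL ILLUSTRATIVE SKETCH ) :
#   beta2M( 0.5 ) = 0.0 AND M2beta( 0.0 ) = 0.5 , SO THE TWO TRANSFORMS ARE INVERSES ;
#   A FRACTIONAL RANK r IN (0,1) IS MAPPED TO THE M VALUE log2( r/(1-r) ) ,
#   PUSHING RANKS CLOSE TO 0 OR 1 TOWARDS -INF OR +INF RESPECTIVELY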
power = lambda s : np.dot( s.T,np.conj(s) )
#
# THE SPECTRUM IS TRANSFORMED INTO A POWER SPECTRUM
spectre_to_power = lambda sdf: \
pd.DataFrame ( \
power ( sdf.T.apply(np.fft.fft) ) , \
index = sdf.index , columns = sdf.index \
)
#
# THE BELOW METHOD IS HIGHLY EXPERIMENTAL
def transform_to_resonances( spectrum ) :
# assumes spectrum is a square symmetric matrix
f_ls = np.mean( np.abs(spectrum.iloc[0,:].quantile([0.01,0.99])) )
reso = spectrum .apply(lambda ser: np.real(ser.to_numpy())) \
.apply(lambda X:[x/f_ls for x in X]) .apply(M2beta) \
.apply(lambda X:[ 2.*x-1. for x in X] )
return ( reso )
#
# TIME AUTOCORRELATIONS ARE NOT THE SAME THING
# AS PEARSON AUTOCORRELATIONS
def calc_autocorrelation( tv , dt=1 , bMeanCentered=True ) :
    # Computes the time autocorrelation via the FFT,
    # i.e. the Wiener-Khinchin theorem
    if bMeanCentered :
        # For e.g. stock price series you typically
        # want to remove the mean first
        tv -= np.mean(tv)
ftv = np.fft.fft(tv)
rt = np.fft.ifft(ftv*np.conj(ftv))
rt = rt/rt[0]
rt = rt[:int(np.floor(len(rt)*0.5))]
return( [dt*t for t in range(len(rt))], [np.real(r) for r in rt] )
if __name__ == '__main__' :
print ( 'SORRY: NO TESTS HERE' )
print ( 'DEVELOPMENTAL VERSION')
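    #
    # NEVERTHELESS , A MINIMAL ILLUSTRATIVE SKETCH ( NOT A FORMAL TEST ) OF THE
    # TRANSFORMS ABOVE , USING A SMALL RANDOM DATA SET WITH HYPOTHETICAL LABELS
    demo_df = pd.DataFrame ( np.random.rand(4,64) ,
                  index   = ['a','b','c','d'] ,
                  columns = [ 'x'+str(i) for i in range(64) ] )
    # RANKS -> M VALUES , THEN THE POWER SPECTRUM OF THE ROWS
    sdf_ = map_df_to_spectral_order_domain ( demo_df )
    pwr_ = spectre_to_power ( sdf_ )
    print ( pwr_ )
    # TIME AUTOCORRELATION OF A SINGLE ROW ( COPY SO THE ORIGINAL IS NOT MEAN CENTERED IN PLACE )
    taus , acs = calc_autocorrelation ( demo_df.loc['a',:].values.copy() , dt=1 )
    print ( acs[:5] )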
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import numpy as np
import pandas as pd
import bokeh
description="""Rudimentary plotting wrappers for impetuous using bokeh"""
run_in_notebook="""
from bokeh.plotting import output_notebook, show
output_notebook()"""
typecheck = lambda x,typ: typ in str(type(x))
list_typecheck = lambda xl,typ, func: func( [ typecheck(x,typ) for x in xl ] )
nice_colors = list( set( [ '#c5c8c6' , '#1d1f21' , '#282a2e' , '#373b41' , '#a54242' , '#cc6666' ,
'#8c9440' , '#b5bd68' , '#de935f' , '#f0c674' , '#5f819d' , '#81a2be' ,
'#85678f' , '#b294bb' , '#5e8d87' , '#8abeb7' , '#707880' ] ) )
make_hex_colors = lambda c : '#%02x%02x%02x' % (c[0]%256,c[1]%256,c[2]%256)
# SCATTER PLOT WRAPPER : CREATES A NEW FIGURE OR ADDS POINTS TO AN EXISTING ONE ( p )
def bscatter ( X , Y , additional_dictionary=None , title='' , color='#ff0000' , p=None, legend_label = None , alpha=1 , axis_labels = None ) :
from bokeh.plotting import figure, output_file, ColumnDataSource
from bokeh.models import HoverTool, Range1d, Text
from bokeh.models import Arrow, OpenHead, NormalHead, VeeHead, Line
#
if 'str' in str(type(color)):
colors_ = [ color for v in X ]
else :
colors_ = color
if 'list' in str(type(alpha)):
alphas_ = alpha
else :
alphas_ = [ alpha for v in X ]
data = { **{'x' : X , 'y' : Y ,
'color': colors_ ,
'alpha': alphas_ } }
ttips = [ ("index " , "$index" ) ,
("(x,y) " , "(@x, @y)" ) ]
if not additional_dictionary is None :
if 'dict' in str(type(additional_dictionary)):
data = {**data , **additional_dictionary }
for key in additional_dictionary.keys() :
ttips.append( ( str(key) , '@'+str(key) ))
source = ColumnDataSource ( data = data )
hover = HoverTool ( tooltips = ttips )
#
if p is None :
p = figure ( plot_width=600 , plot_height=600 ,
tools = [hover,'box_zoom','wheel_zoom','pan','reset','save'],
title = title )
if legend_label is None :
p.circle( 'x' , 'y' , size=12, source=source , color='color', alpha='alpha' )
else :
p.circle( 'x' , 'y' , size=12, source=source , color='color', alpha='alpha' , legend_label=legend_label )
p.xaxis.axis_label = axis_labels[ 0] if not axis_labels is None else 'x'
p.yaxis.axis_label = axis_labels[-1] if not axis_labels is None else 'y'
p.output_backend = 'webgl'
return( p )
# CONVENIENCE WRAPPER : PLOT A SINGLE SCATTER OR SEVERAL SERIES ( LISTS OF X , Y AND COLORS )
def plotter ( x = np.random.rand(10) , y = np.random.rand(10) , colors = '#ff0000' ,
              legends=None, axis_labels = None, bSave = False, name='scatter.html' ):
from bokeh.plotting import output_file, show, save
output_file( name )
    # SELECT THE OUTPUT FUNCTION : SAVE TO FILE OR SHOW INTERACTIVELY
    outp = save if bSave else show
if list_typecheck([x,y,colors],'list',all) :
p = bscatter( [0] , [0] , color = "#ffffff" , alpha=0 )
for i in range(len(x)) :
x_ , y_ , color = x[i] , y[i] , colors[i]
if list_typecheck([legends,axis_labels],'list',all):
label = legends[i]
p = bscatter( x_ , y_ , color = color , p = p , legend_label = label , axis_labels=axis_labels )
else :
p = bscatter( x_ , y_ , color = color , p = p )
outp ( p )
else :
p = bscatter( x,y, color=colors )
outp ( p )
return( p )
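if __name__ == '__main__' :
    #
    # MINIMAL USAGE SKETCH ( ASSUMES A WORKING BOKEH INSTALLATION ; THE DATA AND
    # FILE NAME ARE HYPOTHETICAL ) : TWO RANDOM SERIES SAVED TO scatter.html
    xs = [ np.random.rand(10) , np.random.rand(10) ]
    ys = [ np.random.rand(10) , np.random.rand(10) ]
    p  = plotter ( x = xs , y = ys , colors = ['#ff0000','#0000ff'] ,
                   legends = ['red','blue'] , axis_labels = ['x','y'] ,
                   bSave = True , name = 'scatter.html' )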
Metadata-Version: 2.1
Name: impetuous-gfa
Version: 0.36.3
Summary: Impetuous Quantification, a Statistical Learning library for Humans : Alignments, Clustering, Enrichments and Group Analysis
Home-page: https://github.com/richardtjornhammar/impetuous
Author: Richard Tjörnhammar
Author-email: richard.tjornhammar@gmail.com
License: UNKNOWN
Description: # A Statistical Learning library for Humans
Decomposes a set of expressions into a group expression. The toolkit currently offers enrichment analysis, hierarchical enrichment analysis, PLS regression, shape alignment and clustering, as well as rudimentary factor analysis.
Expression regulation can be studied via a statistical test that relates it to the observables in the journal file. The resulting p values are FDR corrected and reported as adjusted p values.
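For intuition, the FDR step can be pictured as a Benjamini-Hochberg style adjustment; the sketch below is a generic illustration only and not the library's own `qvalues` routine:
```
import numpy as np

def bh_adjust ( pvalues ) :
    # Benjamini-Hochberg adjusted p values, illustrative only
    p = np.asarray( pvalues , dtype=float )
    n = len( p )
    order = np.argsort( p )
    ranked = p[order] * n / np.arange( 1 , n+1 )
    # enforce monotonicity from the largest p value downwards
    q = np.minimum.accumulate( ranked[::-1] )[::-1]
    q = np.clip( q , 0 , 1 )
    out = np.empty( n )
    out[order] = q
    return ( out )

print ( bh_adjust( [0.001, 0.01, 0.02, 0.8] ) )
```
The library's own FDR step may differ in detail (for example through a pi0 estimate), so treat this only as a mental model.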
Visit the active code via :
https://github.com/richardtjornhammar/impetuous
Visit the published code :
https://doi.org/10.5281/zenodo.2594690
Cite using :
DOI: 10.5281/zenodo.2594690
# Pip installation with :
```
pip install impetuous-gfa
```
# Version controlled installation of the Impetuous library
In order to run the Impetuous code snippets we recommend that you install the Nix package manager. Nix package manager links from October 2020:
https://nixos.org/download.html
```
$ curl -L https://nixos.org/nix/install | sh
```
If you cannot install it on your Windows machine then please consider installing the Windows Subsystem for Linux first:
```
https://docs.microsoft.com/en-us/windows/wsl/install-win10
```
In order to run the notebook code you must enter a sensible working environment. Don't worry! We have created one for you. It is version controlled against Python 3.7 and you can get the file here:
https://github.com/richardtjornhammar/rixcfgs/blob/master/code/environments/impetuous-shell.nix
Since you have installed Nix as well as WSL, or use a Linux (NixOS) or BSD-like system, you should be able to execute the following command in a terminal:
```
$ nix-shell impetuous-shell.nix
```
Now you should be able to start your jupyter notebook locally:
```
$ jupyter-notebook impetuous_finance.ipynb
```
and that's it.
# Usage example 1 : elaborate informatics example
code: https://gitlab.com/stochasticdynamics/eplsmta-experiments
docs: https://arxiv.org/pdf/2001.06544.pdf
# Usage example 2 : simple code example
Now, while in a good environment, you can write the following in your Jupyter notebook or in a dedicated file.py:
```
import pandas as pd
import numpy as np
import impetuous.quantification as impq
adf = pd.read_csv( 'analytes.csv' , sep='\t' , index_col=0 )
jdf = pd.read_csv( 'journal.csv' , sep='\t' , index_col=0 )
res_dfs = impq.run_rpls_regression ( adf , jdf , 'S ~ C(industry)' , owner_by = 'angle' )
print ( res_dfs )
```
# Manually updated code backups for this library :
GitLab: https://gitlab.com/richardtjornhammar/impetuous
CSDN: https://codechina.csdn.net/m0_52121311/impetuous
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Description-Content-Type: text/markdown
README.md
setup.py
impetuous_gfa.egg-info/PKG-INFO
impetuous_gfa.egg-info/SOURCES.txt
impetuous_gfa.egg-info/dependency_links.txt
impetuous_gfa.egg-info/top_level.txt
src/impetuous/__init__.py
src/impetuous/clustering.py
src/impetuous/convert.py
src/impetuous/fit.py
src/impetuous/hierarchal.py
src/impetuous/impetuous.py
src/impetuous/pathways.py
src/impetuous/probabilistic.py
src/impetuous/quantification.py
src/impetuous/reducer.py
src/impetuous/spectral.py
src/impetuous/visualisation.py