Commit 3f910780 authored by rictjo

edit

Parent: fc5506eb
@@ -79,5 +79,6 @@ print ( res_dfs )
# Manually updated code backups for this library :
GitLab: https://gitlab.com/richardtjornhammar/impetuous
CSDN: https://codechina.csdn.net/m0_52121311/impetuous
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
import sys
import sklearn.cluster as sc
try :
from numba import jit
bUseNumba = True
except ImportError :
bUseNumba = False
# THE FOLLOWING KMEANS ALGORITHM IS THE AUTHOR'S OWN LOCAL VERSION
if bUseNumba :
@jit(nopython=True)
def seeded_kmeans( dat, cent ):
#
# PYTHON ADAPTATION OF MY C++ CODE THAT CAN BE FOUND IN
# https://github.com/richardtjornhammar/RichTools/blob/master/src/cluster.cc
# AROUND LINE 2345
        # AGAIN, CONSIDER USING THE C++ VERSION SINCE IT IS A LOT FASTER
# HERE WE SPEED IT UP USING NUMBA IF THE USER HAS IT INSTALLED AS A MODULE
#
NN , MM = np.shape ( dat )
KK , LL = np.shape ( cent )
if not LL == MM :
            print ( 'WARNING: DATA FORMAT ERROR. NON-COALESCING COORDINATE AXES' )
labels = [ int(z) for z in np.zeros(NN) ]
w = labels
counts = np.zeros(KK)
tmp_ce = np.zeros(KK*MM).reshape(KK,MM)
old_error , error , TOL = 0. , 1. , 1.0E-10
while abs ( error - old_error ) > TOL :
old_error = error
error = 0.
counts = counts * 0.
tmp_ce = tmp_ce * 0.
# START BC
for h in range ( NN ) :
min_distance = 1.0E30
for i in range ( KK ) :
distance = np.sum( ( dat[h]-cent[i] )**2 )
if distance < min_distance :
labels[h] = i
min_distance = distance
tmp_ce[labels[h]] += dat[ h ]
counts[labels[h]] += 1.0
error += min_distance
# END BC
for i in range ( KK ) :
if counts[i]>0:
cent[i] = tmp_ce[i]/counts[i]
centroids = cent
return ( labels, centroids )
else :
def seeded_kmeans( dat, cent ):
#
        # SLOW, SLUGGISH KMEANS WITH A DOUBLE FOR LOOP
# IN PYTHON! WOW! SUCH SPEED!
#
NN , MM = np.shape ( dat )
KK , LL = np.shape ( cent )
if not LL == MM :
            print ( 'WARNING: DATA FORMAT ERROR. NON-COALESCING COORDINATE AXES' )
labels = [ int(z) for z in np.zeros(NN) ]
w = labels
counts = np.zeros(KK)
tmp_ce = np.zeros(KK*MM).reshape(KK,MM)
old_error , error , TOL = 0. , 1. , 1.0E-10
while abs ( error - old_error ) > TOL :
old_error = error
error = 0.
counts = counts * 0.
tmp_ce = tmp_ce * 0.
# START BC
for h in range ( NN ) :
min_distance = 1.0E30
for i in range ( KK ) :
distance = np.sum( ( dat[h]-cent[i] )**2 )
if distance < min_distance :
labels[h] = i
min_distance = distance
tmp_ce[labels[h]] += dat[ h ]
counts[labels[h]] += 1.0
error += min_distance
# END BC
for i in range ( KK ) :
if counts[i]>0:
cent[i] = tmp_ce[i]/counts[i]
centroids = cent
return ( labels, centroids )
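#
# A minimal, hedged usage sketch (synthetic data and seed centroids, not part of the
# original tests): the routine is handed initial centroid guesses and returns the refined
# labels together with the updated centroids.
def _example_seeded_kmeans_usage () :
    dat  = np.concatenate ( [ np.random.rand(10,2) , np.random.rand(10,2) + 5. ] )
    cent = np.array ( [ [ 0.5 , 0.5 ] , [ 5.5 , 5.5 ] ] )
    labels , centroids = seeded_kmeans ( dat , cent )
    return ( labels , centroids )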
from scipy.spatial.distance import squareform , pdist
absolute_coordinates_to_distance_matrix = lambda Q:squareform(pdist(Q))
distance_matrix_to_geometry_conversion_notes = """
*) TAKE NOTE THAT THE OLD ALGORITHM KNOWN AS DISTANCE GEOMETRY EXISTS. IT CAN BE EMPLOYED ON DATA OF ANY DIMENSION. HERE YOU FIND AN SVD BASED ANALOGUE OF THAT OLD METHOD.
*) PDIST RETURNS PLAIN (SQUARE ROOTED) DISTANCES, SO WE SQUARE THE RESULT IF IT IS NOT ALREADY SQUARED.
*) IN SHORT: THE CONVERSION ROUTINE BACK TO ABSOLUTE COORDINATES EXPECTS SQUARED (R2) DISTANCES IN THE DISTANCE MATRIX.
"""
if bUseNumba :
@jit(nopython=True)
def distance_matrix_to_absolute_coordinates ( D , bSquared = False, n_dimensions=2 ):
if not bSquared :
D = D**2.
DIM = n_dimensions
DIJ = D*0.
M = len(D)
for i in range(M) :
for j in range(M) :
DIJ[i,j] = 0.5* (D[i,-1]+D[j,-1]-D[i,j])
D = DIJ
U,S,Vt = np.linalg.svd ( D , full_matrices = True )
S[DIM:] *= 0.
Z = np.diag(S**0.5)[:,:DIM]
xr = np.dot( Z.T,Vt )
return ( xr )
else :
def distance_matrix_to_absolute_coordinates ( D , bSquared = False, n_dimensions=2 ):
if not bSquared :
D = D**2.
DIM = n_dimensions
DIJ = D*0.
M = len(D)
for i in range(M) :
for j in range(M) :
DIJ[i,j] = 0.5* (D[i,-1]+D[j,-1]-D[i,j])
D = DIJ
U,S,Vt = np.linalg.svd ( D , full_matrices = True )
S[DIM:] *= 0.
Z = np.diag(S**0.5)[:,:DIM]
xr = np.dot( Z.T,Vt )
return ( xr )
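#
# A hedged round-trip sketch for the two helpers above (illustrative only): build the
# distance matrix of some random 3D points and recover coordinates that reproduce the same
# pairwise distances. The recovered points are only defined up to a rigid transformation,
# so the comparison is made on the distance matrices rather than on the coordinates.
def _example_distance_geometry_roundtrip ( n_points = 6 , n_dimensions = 3 ) :
    P    = np.random.rand ( n_points , n_dimensions )
    dm   = absolute_coordinates_to_distance_matrix ( P )
    Xr   = distance_matrix_to_absolute_coordinates ( dm , bSquared = False , n_dimensions = n_dimensions ).T
    dm_r = absolute_coordinates_to_distance_matrix ( Xr )
    return ( np.max ( np.abs ( dm - dm_r ) ) ) # SHOULD BE CLOSE TO ZERO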
def connectivity ( B , val, bVerbose=True ) :
description="""
    This is a cutoff-based clustering algorithm. The intended use is to supply a symmetric distance matrix (non-negative entries) together with a distance cutoff. For a small cutoff you should recover the individual parts of the system, and for a large cutoff you should recover the entire system as a single cluster. It has been employed for statistical analysis work as well as for its original application, segmenting molecular systems.
"""
# PYTHON ADAPTATION OF MY C++ CODE THAT CAN BE FOUND IN
# https://github.com/richardtjornhammar/RichTools/blob/master/src/cluster.cc
# AROUND LINE 2277
# CONSIDER COMPILING AND USING THAT AS A MODULE INSTEAD OF THIS SINCE IT IS
# A LOT FASTER
# FOR A DESCRIPTION READ PAGE 30 (16 INTERNAL NUMBERING) of:
# https://kth.diva-portal.org/smash/get/diva2:748464/FULLTEXT01.pdf
#
nr_sq,mr_sq = np.shape(B)
if nr_sq != mr_sq :
print ( 'ERROR' )
exit (1)
N = mr_sq
res , nvisi, s, NN, ndx, C = [], [], [], [], [], 0
res .append(0)
for i in range(N) :
nvisi.append(i+1)
res.append(0); res.append(0)
ndx.append(i)
while ( len(ndx)>0 ) :
i = ndx[-1] ; ndx = ndx[:-1]
NN = []
if ( nvisi[i]>0 ) :
C-=1
for j in range(N) :
if ( B[i,j]<val ) :
NN.append(j)
while ( len(NN)>0 ) :
# back pop_back
k = NN[-1]; NN = NN[:-1]
nvisi[k] = C
for j in range(N):
if ( B[j,k]<val ) :
for q in range(N) :
if ( nvisi[q] == j+1 ) :
NN.append(q)
if bVerbose : # VERBOSE
print("INFO "+str(-1*C) +" clusters" )
Nc = [ 0 for i in range(-1*C) ]
for q in range(N) :
res[ q*2+1 ] = q;
res[ q*2 ] = nvisi[q]-C;
Nc [res[q*2]]+= 1;
if bVerbose :
print ( " "+str(res[q*2])+" "+str(res[2*q+1]) )
for i in range(-1*C) :
print( "CLUSTER " +str(i)+ " HAS " + str(Nc[i]) + " ELEMENTS")
return ( Nc , np.array(res[:-1]).reshape(-1,2) )
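#
# A brief hedged example of the cutoff clustering above (it mirrors the test in __main__
# at the bottom of this file): two tight pairs of points and a cutoff that separates them.
def _example_connectivity_usage () :
    crds = np.array ( [ [0.,0.] , [0.1,0.] , [5.,5.] , [5.1,5.] ] )
    D    = absolute_coordinates_to_distance_matrix ( crds )
    Nc , assignments = connectivity ( D , 0.5 , bVerbose = False )
    return ( Nc , assignments ) # EXPECT TWO CLUSTERS WITH TWO ELEMENTS EACH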
clustering_algorithm = None
clustering_algorithm = sc.KMeans(10) # CHOOSE SOMETHING YOU LIKE NOT THIS
class Cluster(object):
def __init__( self, nbins=50, nclusters=-1 , use_ranks = False ) :
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from numpy import histogram2d
from scipy.stats import rankdata
self.use_ranks = use_ranks
self.nclusters = nclusters
self.nbins = nbins
self.histogram2d = histogram2d
self.KMeans = KMeans
self.rankdata = rankdata
self.pca_f = PCA(2)
self.centroids_ = None
self.labels_ = None
self.df_ = None
self.num_index_ = None
self.components_ = None
def approximate_density_clustering( self, df, nbins=None ) :
#
# GENES APPROX 20K OK SO APPROX 50 BINS
# ANALYTES ON ROWS, SAMPLE POINTS ON COLUMNS
if nbins is None :
nbins = self.nbins
self.df_= df
frac_df = df
if self.use_ranks :
            frac_df = frac_df.apply( lambda x:self.rankdata( x , method='average' )/float(len(x)) )
self.pca_f.fit(frac_df.T.values)
self.components_ = self.pca_f.components_
vals,xe,ye = self.histogram2d(self.pca_f.components_[0],self.pca_f.components_[1],bins=nbins)
mvs, svsx, svsy = np.mean(vals),np.std(vals,0),np.std(vals,1)
svs = np.sqrt(svsx**2+svsy**2)
#
# IS THERE A DENSITY PEAK SEPARABLE FROM THE MEAN
# SHOULD DO GRADIENT REJECTION BASED ON TTEST PVALUES
hits = vals>mvs+0.5*svs
#
        xe_,ye_ = 0.5*(xe[:-1]+xe[1:]) , 0.5*(ye[:-1]+ye[1:])
idx = np.where(hits); xi,yj = idx[0],idx[1]
centroids = [ (xe[ri],ye[rj]) for (ri,rj) in zip(xi,yj) ]
if self.nclusters == -1 :
self.nclusters = len ( centroids )
if self.nclusters < len ( centroids ) :
import heapq
from scipy.spatial import distance as distance_
a = distance_.cdist ( centroids, centroids, 'euclidean' )
cent_idx = heapq.nlargest ( self.nclusters, range(len(a)), a.reshape(-1).__getitem__ )
centroids = [ centroids[ idx ] for idx in cent_idx ]
kmeans = self.KMeans(len(centroids),init=np.array(centroids))
kmeans.fit(self.pca_f.components_.T)
centers = np.array(kmeans.cluster_centers_).T
self.labels_ = kmeans.labels_
self.centroids_ = centers
self.analyte_dict_ = { c:[] for c in self.labels_ }
[self.analyte_dict_[self.labels_[i]].append(df.index[i]) for i in range(len(self.labels_)) ]
return ( self.analyte_dict_ )
def write_gmt(self, filename = './cluster_file.gmt' ) :
with open(filename,'w') as of :
for k,v in self.analyte_dict_.items() :
print ( 'CLU-'+str(k),'\tDESCRIPTION\t'+'\t'.join(v), file=of )
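#
# A hedged usage sketch of the Cluster class (purely synthetic frame with analytes on the
# rows and sample points on the columns, as approximate_density_clustering assumes; the
# output file name is illustrative only).
def _example_density_clustering () :
    df = pd.DataFrame ( np.random.rand ( 200 , 10 ) ,
                        index = [ 'analyte'+str(i) for i in range(200) ] )
    clu = Cluster ( nbins = 10 )
    analyte_clusters = clu.approximate_density_clustering ( df )
    clu.write_gmt ( './example_cluster_file.gmt' )
    return ( analyte_clusters )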
class ManifoldClustering ( Cluster ) :
def __init__( self , nbins=50 ) :
from sklearn.cluster import KMeans
from sklearn.manifold import MDS, TSNE
from numpy import histogram2d
from scipy.stats import rankdata
self.nbins = nbins
self.histogram2d = histogram2d
self.KMeans = KMeans
self.rankdata = rankdata
self.mds = MDS ( n_components=2 )
self.tsne = TSNE ( n_components=2 )
self.man = None
self.centroids_ = None
self.labels_ = None
self.df_ = None
self.num_index_ = None
self.components_ = None
def approximate_embedding( self, df, nbins=None , use_tsne=True ) :
self.man = self.tsne
if not use_tsne :
self.man = self.mds
print ( 'WARNING::SLOW AND WASTEFUL' )
if nbins is None :
nbins = self.nbins
self.df_= df
frac_df = df.apply( lambda x:self.rankdata( x , method='average' )/float(len(x)) )
self.components_ = np.array(self.man.fit_transform(frac_df.values)).T
vals,xe,ye = self.histogram2d(self.components_[0],self.components_[1],bins=nbins)
mvs, svsx, svsy = np.mean(vals),np.std(vals,0),np.std(vals,1)
svs = np.sqrt( svsx**2 + svsy**2 )
#
# IS THERE A DENSITY PEAK SEPARABLE FROM THE MEAN
# SHOULD DO GRADIENT REJECTION BASED ON TTEST PVALUES
hits = vals>mvs+0.5*svs
#print(hits,vals)
        xe_,ye_ = 0.5*(xe[:-1]+xe[1:]) , 0.5*(ye[:-1]+ye[1:])
idx = np.where(hits); xi,yj = idx[0],idx[1]
centroids = [ (xe[ri],ye[rj]) for (ri,rj) in zip(xi,yj) ]
#
kmeans = self.KMeans(len(centroids),init=np.array(centroids))
kmeans.fit(self.components_.T)
centers = np.array(kmeans.cluster_centers_).T
self.labels_ = kmeans.labels_
self.centroids_ = centers
self.analyte_dict_ = { c:[] for c in self.labels_ }
[self.analyte_dict_[self.labels_[i]].append(df.index[i]) for i in range(len(self.labels_)) ]
return ( self.analyte_dict_ )
def run_clustering_and_write_gmt( df , ca , filename = './approx_cluster_file.gmt' ) :
labels = ca.fit_predict(df.values)
llabs = [ l for l in labels ]; ulabs=set(llabs)
with open(filename,'w') as of :
for ulab in ulabs :
analytes = df.iloc[llabs==ulab].index.values
print ( 'CLU-'+str(ulab),'\tDESCRIPTION\t'+'\t'.join(analytes), file=of )
def projection_knn_assignment ( projected_coords , df , NMaxGuess=-1 , n_dimensions=2 ) :
coords_s = projected_coords.dropna( 0 )
centroid_coordinates = []
for row in df.T :
guess = sorted ( [ (v,i) for (v,i) in zip( df.loc[row].values,df.loc[row].index ) ] ) [::-1][:NMaxGuess]
maxWeights = [ i[1] for i in guess ]
use = df.loc[row,maxWeights]
S = np.sum ( use.values )
S = 1. if S==0 else S
crd = np.dot(use.values,coords_s.loc[use.index.values].values)/S
centroid_coordinates.append(crd)
centroids_df = pd.DataFrame ( centroid_coordinates , index=df.index , columns=[ 'C'+str(i) for i in range(n_dimensions) ] )
labels , centroids = seeded_kmeans( coords_s.values,centroids_df.values )
coords_s.loc[:,'owner'] = centroids_df.iloc[labels].index.values
for i in range(len(centroids.T)) :
centroids_df.loc[:,'E'+str(i) ] = (centroids.T)[i]
return ( centroids_df , coords_s )
def make_clustering_visualisation_df ( CLUSTER , df=None , add_synonyms = False ,
output_name = 'feature_clusters_output.csv'
) :
x_pc1 = CLUSTER.components_[0]
y_pc2 = CLUSTER.components_[1]
L_C = len(CLUSTER.centroids_[0])
#
# MAKE CLUSTER COLORS
make_hex_colors = lambda c : '#%02x%02x%02x' % (c[0]%256,c[1]%256,c[2]%256)
C0 = [255,255,255] ; cluster_colors = []
#
for i in CLUSTER.labels_ :
        C0_ = list(C0) ; C0_[i%3] = int(np.floor(C0[i%3]-(i/float(L_C))*255))
cluster_colors.append(make_hex_colors(C0_))
if not df is None :
if add_synonyms :
            # NOTE: THE ens2sym AND ens2sym_2 MAPPINGS ARE ASSUMED TO EXIST IN THE CALLING SCOPE
            synonyms = [ ens2sym[df.index.values[i]][0] if df.index.values[i] in ens2sym \
                         else ens2sym_2[df.index.values[i]] if df.index.values[i] in ens2sym_2 \
                         else df.index.values[i] for i in range(len(df.index.values)) ]
else :
synonyms = df.index.values
data = []
for (x,y,t,cl,co) in zip( x_pc1,y_pc2,synonyms , [cl for cl in CLUSTER.labels_] ,
[cluster_colors[cl] for cl in CLUSTER.labels_] ) :
data.append([x,y,t,cl,co])
clustering_df = pd.DataFrame( data , columns = ['X','Y','Type','Cluster','Color'])
if not df is None :
clustering_df.index = df.index.values
clustering_df.to_csv( output_name , '\t' )
return ( clustering_df )
def backprojection_clustering ( analyte_df , bRanked=False , n_dimensions=2 ,
bDoFeatures=True , bDoSamples=True ) :
from scipy.stats import rankdata
if bRanked :
rana_df = analyte_df .apply( lambda x:(rankdata(x,'average')-0.5)/len(x) )
else :
rana_df = analyte_df
dimcrdnames = [ 'd'+str(i) for i in range(n_dimensions) ]
#
    # Cluster by backprojecting the distance matrix onto absolute coordinates
cluster_coords_f = None
if bDoFeatures :
#
dM1 = absolute_coordinates_to_distance_matrix( rana_df.values )
pd.DataFrame(dM1,index=rana_df.index,columns=rana_df.index).to_csv('../data/dM1.tsv','\t')
#
# Project it back onto first two components
max_var_projection = distance_matrix_to_absolute_coordinates ( dM1 , n_dimensions=n_dimensions )
cluster_coords_f = pd.DataFrame( max_var_projection ,
columns = rana_df.index ,
index = dimcrdnames ).T
cluster_coords_s = None
if bDoSamples :
#
# And again for all the samples
dM2 = absolute_coordinates_to_distance_matrix( rana_df.T.values )
pd.DataFrame(dM2,index=rana_df.columns,columns=rana_df.columns).to_csv('../data/dM2.tsv','\t')
#
# This algorithm is exact but scales somewhere between n^2 and n log n
max_var_projection = distance_matrix_to_absolute_coordinates ( dM2 , n_dimensions=n_dimensions )
cluster_coords_s = pd.DataFrame( max_var_projection ,
columns = rana_df.columns ,
index = dimcrdnames ).T
cluster_coords_s.to_csv('../data/conclust_s.tsv','\t')
return ( cluster_coords_f,cluster_coords_s )
if __name__ == '__main__' :
if False :
#
# TEST DEPENDS ON THE DIABETES DATA FROM BROAD INSTITUTE
filename = './Diabetes_collapsed_symbols.gct'
df_ = pd.read_csv(filename,'\t',index_col=0,header=2)
ddf = df_.loc[:,[ col for col in df_.columns if '_' in col ]]
ddf .index = [idx.split('/')[0] for idx in ddf.index]
run_clustering_and_write_gmt( ddf , clustering_algorithm )
#
CLU = Cluster( )
CLU .approximate_density_clustering(ddf)
CLU .write_gmt()
if False :
A = np.array( [ [0.00, 0.10, 0.10, 9.00, 9.00, 9.00],
[0.10, 0.00, 0.15, 9.00, 9.00, 9.00],
[0.10, 0.15, 0.00, 9.00, 9.00, 9.00],
[9.00, 9.00, 9.00, 0.00, 0.10, 0.10],
[9.10, 9.00, 9.00, 0.10, 0.00, 0.15],
[9.10, 9.00, 9.00, 0.10, 0.15, 0.00] ] )
print( connectivity(A,0.01) )
"""
Copyright 2019 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
import networkx as nx
from networkx.readwrite import json_graph
import json
import sys
def drop_duplicate_indices( df ):
df_ = df.loc[~df.index.duplicated(keep='first')]
return df_
def write_tree( tree , outfile='tree.json' ):
root = [ eid for eid,ancestor in tree.in_degree() if ancestor == 0 ][ 0 ]
o_json = json_graph.tree_data( tree , root )
if not outfile is None:
with open(outfile, 'w') as o_file:
json.dump(o_json, o_file )
return( o_json )
def add_attributes_to_tree ( p_df , tree ):
id_lookup = { rid:lid for (lid,rid) in nx.get_node_attributes(tree,'source').items() }
add_attributes = p_df.columns.values
for attribute in add_attributes:
propd = { id_lookup[idx]:{attribute:val} for (idx,val)
in zip(p_df.index.values,p_df.loc[:,attribute]) if idx in set(id_lookup.keys()) }
nx.set_node_attributes( tree , propd )
return( tree )
def parent_child_to_dag (
relationship_file = './PCLIST.txt' ,
i_p = 0 , i_c = 1
) :
n_df = pd .read_csv ( relationship_file , '\t' )
pair_tuples = [ (p,c) for (p,c) in zip(n_df.iloc[:,i_p],n_df.iloc[:,i_c]) ]
children_of = {} ; all_names = set([])
for ( p,c ) in pair_tuples :
if not 'list' in str(type(c)):
C = [c]
all_names = all_names | set([p]) | set(C)
if p in children_of :
children_of[p] .append(c)
else :
children_of[p] = [c]
G = nx .DiGraph()
G .add_nodes_from ( all_names )
G .add_edges_from ( pair_tuples )
tree = nx.algorithms.dag.dag_to_branching( G )
root = [ eid for eid,ancestor in tree.in_degree() if ancestor == 0 ][ 0 ]
descendants = [ ( idx , nx.algorithms.dag.descendants(G,idx) ) for idx in all_names ]
ancestors = [ ( idx , nx.algorithms.dag.ancestors(G,idx) ) for idx in all_names ]
return ( tree,ancestors,descendants )
def make_pathway_ancestor_data_frame(ancestors):
p_df = None
for k,v in ancestors :
t_df = pd.DataFrame([[','.join(list(v)),len(v)]],index=[k],columns=['DAG,ancestors','DAG,level'])
if p_df is None :
p_df = t_df
else :
p_df = pd.concat([p_df,t_df])
return( p_df )
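#
# A hedged, self-contained sketch of the DAG helpers above: write a tiny parent-child list
# to disk (the file name is illustrative only), build the branching and summarise the
# ancestors of every node.
def _example_build_dag ( tmpfile = './example_pclist.txt' ) :
    pd.DataFrame ( [ ['root','A'] , ['root','B'] , ['A','C'] ] ,
                   columns = ['parent','child'] ).to_csv ( tmpfile , sep='\t' , index=False )
    tree , ancestors , descendants = parent_child_to_dag ( tmpfile )
    p_df = make_pathway_ancestor_data_frame ( ancestors )
    return ( tree , p_df )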
def normalise_for_apples_and_oranges_stats( X , method='ordinal' ):
    from scipy.stats import rankdata
    X_ = rankdata( X , method=method )/len(X)
return(X_)
def make_group_analytes_unique( grouping_file , delimiter='\t' ):
    unique_grouping_file = '/'.join(grouping_file.split('/')[:-1]) + '/unique_'+grouping_file.split('/')[-1]
    with open( unique_grouping_file , 'w' ) as of:
with open( grouping_file ) as input :
for line in input :
vline = line.replace('\n','').split(delimiter)
gid, gdesc, analytes_ = vline[0], vline[1], list(set(vline[2:]))
nvec = [gid,gdesc] ; [ nvec.append(a) for a in analytes_ ]
print ( delimiter.join(nvec) , file = of )
def read_conversions(file_name) :
gene2ens = {} ; non_unique = []
with open( file_name , 'r' ) as infile:
if sys.version_info[0] < 3:
infile.next()
else :
next(infile)
for line in infile:
words = line.strip().split('\t')
if len(words)==2 :
ens, gene = words
if gene in gene2ens:
gene2ens[gene].append(ens)
else :
gene2ens[gene] = [ens]
return gene2ens
def read_gene_ensemble_conversion(file_name):
gene2ens = {} ; non_unique = []
with open(file_name,'r') as infile:
if sys.version_info[0] < 3:
infile.next()
else:
next(infile)
for line in infile:
words = line.strip().split('\t')
if len(words)==2 :
ens, gene = words
if gene in gene2ens:
non_unique.append((gene,ens,gene2ens[gene]))
else :
gene2ens[gene] = ens
if len(non_unique)>0:
print(' WARNING ' )
print( 'FOUND ', len(non_unique), ' NON UNIQUE ENTRIES' )
return gene2ens
def create_synonyms( convert_file , unique_mapping=False ):
# CREATE SYNONYMS
ens2sym , sym2ens = {} , {}
if unique_mapping:
sym2ens = read_gene_ensemble_conversion( convert_file )
ens2sym = { v:k for k,v in sym2ens.items() }
else :
sym2ens_list = read_conversions( convert_file )
ens2sym_list = {}
for s,L in sym2ens_list.items() :
for e in L:
if e in ens2sym_list:
                    ens2sym_list[e].append(s)
else:
ens2sym_list[e] = [s]
ens2sym = ens2sym_list
sym2ens = sym2ens_list
return ( ens2sym , sym2ens )
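#
# A hedged sketch of the conversion-file format the readers above assume: one header line
# followed by tab separated "ensembl-id <tab> gene-symbol" rows. The file name and the
# example identifiers are illustrative only.
def _example_create_synonyms ( tmpfile = './example_conv.txt' ) :
    with open ( tmpfile , 'w' ) as of :
        print ( 'ensembl\tsymbol' , file = of )
        print ( 'ENSG00000139618\tBRCA2' , file = of )
        print ( 'ENSG00000012048\tBRCA1' , file = of )
    ens2sym , sym2ens = create_synonyms ( tmpfile )
    return ( ens2sym , sym2ens )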
def flatten_dict( s2e ) :
# FORCES FIRST ELEMENT
ndict = {}
for (s,e) in s2e.items() :
if 'list' in str(type(e)) :
ndict[s] = e[0]
else :
ndict[s] = e
return ( ndict )
def convert_rdata_to_dataframe ( filename ) :
#
from rpy2.robjects import r as R
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
import rpy2.robjects as ro
#
    print ( 'WARNING: THIS FUNCTION NEEDS VALUE ERROR CHECKING' )
rd_ = R.load( filename )
if 'matrix' in str( type( R[rd_[0]] ) ).lower() :
column_names = [ R[rd_[0]].colnames ]
index_names = [ R[rd_[0]].rownames ]
else :
column_names = [ [r for r in _rd_.colnames] for _rd_ in R[rd_[0]]]
index_names = [ [r for r in _rd_.rownames] for _rd_ in R[rd_[0]]]
#
pandas2ri.activate()
#
    # SMALL HELPER FUNCTION THAT TRANSFORMS AN RDATA OBJECT INTO
# A PANDAS DATAFRAME. CURRENTLY THERE IS NO VALUE ERROR CHECKING
#
rd = R.load( filename )
raw_df_l = []
if 'ndarray' in str( type( R[rd[0]] ) ).lower() :
[ raw_df_l.append( R[rd[0]] ) ]
else :
[ raw_df_l.append( rdf ) for rdf in ro.vectors.DataFrame(R[rd[0]]) ]
full_df_dict = {} ; i_ = 0
for raw_df,colnames,rownames in zip( raw_df_l,column_names,index_names ) :
pdf = pd.DataFrame( raw_df , columns=colnames , index=rownames )
full_df_dict[i_] = pdf
i_ = i_ + 1
pandas2ri.deactivate()
return ( full_df_dict )
import os
if __name__ == '__main__' :
#
bMOFA_data = True
if bMOFA_data :
import os
# os.system('mkdir ../data')
# os.system('wget https://github.com/bioFAM/MOFAdata/blob/master/data/CLL_data.RData')
# os.system('mv CLL_data.RData ../data/.')
df_dict = convert_rdata_to_dataframe( filename = '../data/CLL_data.RData' )
pruned_df = df_dict[2].T.dropna().T
journal_df = df_dict[3].loc[['IGHV'],:]
mask = [ v>=0 for v in journal_df.loc['IGHV'].values ]
journal_df = journal_df.iloc[ :,mask ]
use = list(set(pruned_df.columns)&set(journal_df.columns))
analyte_df = pruned_df .loc[ :,use ].apply( lambda x:np.log2(1+x) )
journal_df = journal_df .loc[ :,use ]
print ( analyte_df,journal_df )
from impetuous.quantification import *
qdf = quantify_groups ( analyte_df , journal_df , 'anova~C(IGHV)' , '~/Work/data/naming_and_annotations/reactome/reactome.gmt' )
print(qdf)
exit(1)
base = '../../../data/'
convert_file = base + 'naming_and_annotations/conv.txt'
ens2sym , sym2ens = create_synonyms( convert_file )
s2e = { **flatten_dict(sym2ens) }
name,col,sep = 'Networks.nfo',0,'\t'
with open(name,'r') as input:
for line in input:
gname = line.split(sep)[col].replace('\n','')
if gname in s2e:
print(gname)
else:
print('MISSING:',gname)
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import numpy as np
import pandas as pd
def read_xyz(name='data/naj.xyz',header=2,sep=' '):
mol_str = pd.read_csv(name,header=header)
P=[]
for i_ in range(len(mol_str.index)):
line = mol_str.iloc[i_,:].values[0]
lsp = [l.replace(' ','') for l in line.split(sep) if len(l)>0]
P.append(lsp)
pdf = pd.DataFrame(P); pdf.index=pdf.iloc[:,0].values ; pdf=pdf.iloc[:,1:4]
return(pdf.apply(pd.to_numeric))
def KabschAlignment( P,Q ):
#
# https://en.wikipedia.org/wiki/Kabsch_algorithm
# C++ VERSION: https://github.com/richardtjornhammar/RichTools/blob/master/src/richfit.cc
    # IN THE VICINITY OF LINE 524
#
N,DIM = np.shape( P )
M,DIM = np.shape( Q )
if DIM>N or not N==M :
print( 'MALFORMED COORDINATE PROBLEM' )
exit( 1 )
q0 , p0 = np.mean(Q,0) , np.mean(P,0)
cQ , cP = Q - q0 , P - p0
H = np.dot(cP.T,cQ)
I = np.eye( DIM )
U, S, VT = np.linalg.svd( H, full_matrices=False )
Ut = np.dot( VT.T,U.T )
I[DIM-1,DIM-1] = 2*(np.linalg.det(Ut) > 0)-1
ROT = np.dot( VT.T,np.dot(I,U.T) )
B = np.dot(ROT,P.T).T + q0 - np.dot(ROT,p0)
return ( B )
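#
# A small hedged sanity sketch (assumed usage): rotate and shift a random point cloud and
# check that KabschAlignment maps the original back onto it to numerical precision.
def _example_kabsch_alignment () :
    P     = np.random.rand ( 20 , 3 )
    theta = 0.25 * np.pi
    R     = np.array ( [ [ np.cos(theta) , -np.sin(theta) , 0. ] ,
                         [ np.sin(theta) ,  np.cos(theta) , 0. ] ,
                         [ 0.            ,  0.            , 1. ] ] )
    Q     = np.dot ( P , R.T ) + 1.0
    B     = KabschAlignment ( P , Q )
    return ( np.allclose ( B , Q , atol=1.0E-8 ) ) # EXPECTED TO BE TRUE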
def WeightsAndScoresOf( P , bFA=False ) :
p0 = np.mean( P,0 )
U, S, VT = np.linalg.svd( P-p0 , full_matrices=False )
weights = U
if bFA :
scores = np.dot(S,VT).T
return ( weights , scores )
scores = VT.T
return ( weights , scores )
def ShapeAlignment( P, Q ,
bReturnTransform = False ,
bShiftModel = True ,
bUnrestricted = False ) :
#
# [*] C++ VERSION: https://github.com/richardtjornhammar/RichTools/blob/master/src/richfit.cc
    # SEARCH FOR SHAPE FIT TO FIND SIMILAR CODE IN THE RICHFIT REPO
#
description = """
    A NAIVE SHAPE FIT PROCEDURE; MORE SOPHISTICATED
    VERSIONS WRITTEN IN C++ CAN BE FOUND IN MY C++ REPO [*].
    HERE WE WORK UNDER THE ASSUMPTION THAT Q IS THE MODEL,
    SO THAT WE SHOULD HAVE SIZE Q < SIZE P, WITH UNKNOWN
    ORDERING, AND THAT THEY SHARE A COMMON SECOND DIMENSION.
    IN THIS ROUTINE THE COARSE GRAINED DATA ( THE MODEL ) IS
    MOVED TO FIT THE FINE GRAINED DATA ( THE DATA )
"""
N,DIM = np.shape( P )
M,DIM = np.shape( Q )
W = (N<M)*N+(N>=M)*M
if (DIM>W or N<M) and not bUnrestricted :
print ( 'MALFORMED PROBLEM' )
print ( description )
exit ( 1 )
q0 , p0 = np.mean(Q,0) , np.mean(P,0)
cQ , cP = Q - q0 , P - p0
sQ = np.dot( cQ.T,cQ )
sP = np.dot( cP.T,cP )
H = np.dot(sP.T,sQ)
I = np.eye( DIM )
U, S, VT = np.linalg.svd( H, full_matrices=False )
Ut = np.dot( VT.T,U.T )
I[DIM-1,DIM-1] = 2*(np.linalg.det(Ut) > 0)-1
ROT = np.dot( VT.T,np.dot(I,U.T) )
if bReturnTransform :
return ( ROT,q0,p0 )
if bShiftModel :# SHIFT THE COARSE GRAINED DATA
B = np.dot(ROT,Q.T).T +p0 - np.dot(ROT,q0)
else : # SHIFT THE FINE GRAINED DATA
B = np.dot(ROT,P.T).T +q0 - np.dot(ROT,p0)
return ( B )
def low_missing_value_imputation ( fdf , fraction = 0.9 , absolute = True ) :
# THIS SVD BASED IMPUTATION METHOD WAS FIRST WRITTEN FOR THE RANKOR PACKAGE
# ORIGINAL CODE IN https://github.com/richardtjornhammar/rankor/blob/master/src/rankor/imputation.py
#
import numpy as np
#
# fdf is a dataframe with NaN values
# fraction is the fraction of information that should be kept
# absolute is used if the data is positive
#
V = fdf.apply(pd.to_numeric).fillna(0).values
u,s,vt = np.linalg.svd(V,full_matrices=False)
s = np.array( [ s[i_] if i_<np.floor(len(s)*fraction) else 0 for i_ in range(len(s)) ] )
nan_values = np.dot(np.dot(u,np.diag(s)),vt)
if absolute :
nan_values = np.abs(nan_values)
#
# THIS CAN BE DONE BETTER
for j in range(len(fdf.columns.values)):
for i in range(len(fdf.index.values)):
if 'nan' in str(fdf.iloc[i,j]).lower():
fdf.iloc[i,j] = nan_values[i,j]
return ( fdf )
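#
# A hedged usage sketch of the SVD imputation above: knock two values out of a low rank
# frame and let the truncated reconstruction fill them back in (synthetic numbers only).
def _example_low_missing_value_imputation () :
    base = np.outer ( np.arange( 1. , 7. ) , np.arange( 1. , 5. ) ) # A RANK ONE 6x4 MATRIX
    fdf  = pd.DataFrame ( base )
    fdf.iloc[0,0] = np.nan ; fdf.iloc[3,2] = np.nan
    imputed = low_missing_value_imputation ( fdf , fraction = 0.9 , absolute = True )
    return ( imputed )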
if __name__ == '__main__' :
#
# IF YOU REQUIRE THE DATA THEN LOOK IN :
# https://github.com/richardtjornhammar/RichTools
# WHERE YOU CAN FIND THE FILES USED HERE
#
if True :
colors = {'H':'#777777','C':'#00FF00','N':'#FF00FF','O':'#FF0000','P':'#FAFAFA'}
Q = read_xyz( name='data/naj.xyz' , header=2 , sep=' ' )
if False : # TEST KABSCH ALGORITHM
P = Q .copy()
Q = Q * -1
Q = Q + np.random.rand(Q.size).reshape(np.shape(Q.values))
P_ , Q_ = P.copy() , Q.copy()
P = P_.values
Q = Q_.values
B = KabschAlignment( P,Q )
B = pd.DataFrame( B , index = P_.index.values ); print( pd.concat([Q,B],1))
if True : # TEST MY SHAPE ALGORITHM
P = read_xyz ( name='data/cluster0.xyz' , header=2 , sep='\t' )
P_ , Q_= P.values,Q.values
B_ = ShapeAlignment( P_,Q_ )
B = pd.DataFrame(B_, index=Q.index,columns=Q.columns)
pd.concat([B,P],0).to_csv('data/shifted.xyz','\t')
"""
Copyright 2019 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
from impetuous.quantification import group_significance
from impetuous.convert import *
def pathway_frame_from_file ( filename ,
delimiter = '\t' , item_sep = ',' ) :
pdf = None
with open( filename,'r' ) as input :
for line in input :
lspl = line.replace('\n','').split(delimiter)
analytes_ = lspl[2:]
desc = lspl[1]
iden = lspl[0]
ps = pd.Series( [desc , item_sep.join(analytes_) ] ,
name = iden , index = ['description','analytes'] )
pdf = pd.concat([pdf,pd.DataFrame(ps).T])
return ( pdf )
def create_dag_representation_df ( pathway_file = '../data/GROUPDEFINITIONS.gmt' ,
pcfile = '../data/PCLIST.txt'
) :
pc_list_file = pcfile
tree , ance , desc = parent_child_to_dag ( pc_list_file )
pdf = make_pathway_ancestor_data_frame ( ance )
pdf_ = pathway_frame_from_file( pathway_file )
pdf.index = [v.replace(' ','') for v in pdf.index.values]
pdf_.index= [v.replace(' ','') for v in pdf_.index.values]
dag_df = pd.concat([pdf.T,pdf_.T]).T
return ( dag_df , tree )
def HierarchalEnrichment (
analyte_df , dag_df , dag_level_label = 'DAG,l' ,
ancestors_id_label = 'aid' , id_name = None , threshold = 0.05 ,
p_label = 'C(Status),p', analyte_name_label = 'analytes' ,
item_delimiter = ',' , alexa_elim=False , alternative = 'two-sided'
) :
#
# NEEDS AN ANALYTE SIGNIFICANCE FRAME:
# INCLUDING P VALUES OF ANALYTES
# DAG GRAPH DESCRIPTION FRAME:
    # INCLUDING NODE ID, NODE ANALYTES FIELD (SEPARATED BY THE ITEM DELIMITER)
    # INCLUDING ANCESTORS FIELD (SEPARATED BY THE ITEM DELIMITER)
# DAG LEVEL OF EACH NODE
tolerance = threshold
df = dag_df ; dag_depth = np.max( df[dag_level_label].values )
AllAnalytes = set( analyte_df.index.values ) ; nidx = len( AllAnalytes )
SigAnalytes = set( analyte_df.iloc[ (analyte_df.loc[:,p_label].values < tolerance), : ].index.values )
if len( AllAnalytes ) == len( SigAnalytes ) :
print ( 'THIS STATISTICAL TEST WILL BE NONSENSE' )
print ( 'TRY A DIFFERENT THRESHOLD' )
marked_analytes = {} ; used_analytes = {} ; node_sig = {}
for d in range( dag_depth, 0, -1 ) :
# ROOT IS NOT INCLUDED
filter_ = df [ dag_level_label ] == d
nodes = df.iloc[ [i for i in np.where(filter_)[ 0 ]] ,:].index.values
for node in nodes :
if 'nan' in str(df.loc[node,analyte_name_label]).lower() :
continue
analytes_ = df.loc[node,analyte_name_label].replace('\n','').replace(' ','').split(item_delimiter)
try :
group = analyte_df.loc[[a for a in analytes_ if a in AllAnalytes] ].dropna( axis=0, how='any', thresh=analyte_df.shape[1]/2 ).drop_duplicates()
except KeyError as e :
continue
if node in marked_analytes :
unused_group = group.loc[ list( set(group.index.values)-marked_analytes[node] ) ]
group = unused_group
L_ = len( group ) ; str_analytes=','.join(group.index.values)
if L_ > 0 :
used_analytes[node] = ','.join( group.index.values )
pv,odds = group_significance( group , AllAnalytes=AllAnalytes, SigAnalytes=SigAnalytes , tolerance = threshold , alternative=alternative )
node_sig[node] = pv ; marked_ = set( group.index.values )
ancestors = df.loc[node,ancestors_id_label].replace('\n','').replace(' ','').split(item_delimiter)
if alexa_elim and pv > threshold : # USE ALEXAS ELIM ALGORITHM : IS NOT DEFAULT
continue
for u in ancestors :
if u in marked_analytes :
us = marked_analytes[u]
marked_analytes[u] = us | marked_
else :
marked_analytes[u] = marked_
df['Hierarchal,p'] = [ node_sig[idx] if idx in node_sig else 1. for idx in df.index.values ]
df['Included analytes,ids'] = [ used_analytes[idx] if idx in used_analytes else '' for idx in df.index.values ]
df = df.dropna()
return ( df )
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from impetuous.clustering import *
from impetuous.quantification import *
import sys
if __name__ == '__main__' :
inventors__ = "Richard Tjörnhammar and Edward Tjörnhammar"
"""
Copyright 2019 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
import re
from impetuous.convert import read_conversions, read_gene_ensemble_conversion, create_synonyms
def find_compartment( istr ) :
return ( re.findall( r'\[(.*?)\]', istr ) )
class Group( object ) :
def get_next(self):
pass
## PATHWAY SECTION
class GenericPathway( Group ) :
def __init__( self , path , gene_name_start = "ENSG0" , gene_mapping = None, # GENE ID MAPPING
is_own_pathway = False, list_like = False , add_pathway_prefix='',
gene_pos=0 , pathway_pos=1 , pathway_desc_pos=3, seperator='\t' ) :
self.file = path
self.prefered_genes = gene_name_start
self.pathways , self.pathway_names = {},{}
self.sep = seperator
self.is_own_pathway = is_own_pathway
self.gene_mapping = gene_mapping
self.gene_pos = gene_pos
self.pathway_pos = pathway_pos
self.pathway_desc_pos = pathway_desc_pos
self.list_like = list_like
self.add_pathway_prefix = add_pathway_prefix
self.replace_pair = None
self.internal_table = None
self.generator = None
self.active_read = False
self.restrict_id_to = None
if 'str' in str(type(self.file)):
self.read_and_parse()
else :
self.parse_df()
def add_pathway_synonyms ( self , synonym_dict , prefix='' ) :
for pathway,pathway_name,genes in self.get_generator() :
synonyms = []
for g in genes :
if g in synonym_dict or prefix+g in synonym_dict :
k = prefix * ( not g in synonym_dict ) + g
sdg = synonym_dict[k]
if 'list' in str(type(sdg)) :
[ synonyms.append(s) for s in sdg ]
if 'str' in str(type(sdg)) :
synonyms.append(sdg)
[ self.pathways[pathway].append(s) for s in synonyms ]
def make_gmt_pathway_file ( self , filename , verbose=False , delimiter='\t', gene_mapping=None ):
#if 'None' in str(type( self.generator )) :
# self.generator = self.get_generator()
if not gene_mapping is None:
self.gene_mapping = gene_mapping
if 'str' in str(type(filename)) :
if 'str' in str( type( filename ) ) :
with open( filename, 'w' ) as o_file :
for pathway, pathway_name, genes in self.get_generator() :
row = list() ; row . append ( pathway )
row . append ( pathway_name )
genes_loc = genes
if 'dict' in str(type(self.gene_mapping)) :
genes_loc = [ self.gene_mapping[gene] if gene in self.gene_mapping.keys() else gene for gene in genes ]
[ row.append ( gene ) for gene in list(set(genes_loc)) ]
row_str = delimiter.join(row)
print(row_str)
o_file . write( row_str+'\n' )
else :
print ( 'MUST SUPPLY A FILENAME' )
def make_internal_table(self, verbose=False, delimiter='\t', output_file=None ) :
self.internal_table = list()
generator = self.get_generator( )
for pathway, pathway_name, genes in generator :
row = list() ; row . append ( pathway )
row .append ( pathway_name )
[ row .append ( gene ) for gene in genes ]
row_str = delimiter.join(row)
self.internal_table.append( row )
def parse_df(self):
if not self.is_own_pathway:
print('ERROR: OPERATION NOT SUPPORTED')
exit(1)
sfcv = set( self.file.columns.values )
if 'gene' in sfcv and 'pathway' in sfcv and 'description' in sfcv:
print ( self.file.columns )
else:
genes = self.file.index.values
for gene in genes :
pathway = self.add_pathway_prefix + gene
self.pathways[pathway] = [gene]
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys() :
self.pathway_names[pathway] = self.gene_mapping[gene]
else :
self.pathway_names[pathway] = gene
def get_generator_from_df(self):
self.parse_df()
for key in self.pathways.keys():
yield(key,self.pathway_names[key],self.pathways[key])
def read_and_parse(self):
with open(self.file) as input:
pathway, pathway_name, genes = "", "", []
for line in input:
lspl = line.split('\t')
if not 'None' in str(type(self.replace_pair)):
pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] ).replace( self.replace_pair[0],self.replace_pair[1] )
else:
pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] )
pathway_name = lspl[self.pathway_desc_pos]
if self.list_like :
# LIST LIKE PATHWAY INVENTORY CANNOT HAVE SELF MAPPING
# HERE WE ASSUME ALL GENES FOLLOW THE FIRST GENE_POS
genes = [ lspl[ir].replace('\n','') for ir in range(self.gene_pos,len(lspl)) ]
if not 'None' in str(type(self.gene_mapping)) :
renamed_genes = [ self.gene_mapping[gene] if gene in self.gene_mapping.keys() else gene for gene in genes ]
genes = renamed_genes
if pathway in self.pathways :
[ self.pathways[pathway].append(gene) for gene in genes ]
else :
self.pathways[pathway] = genes
self.pathway_names[pathway] = pathway_name
else :
if not line.startswith(self.prefered_genes) or len(line)==0:
continue
gene = lspl[ self.gene_pos ]
if self.is_own_pathway :
if gene in self.pathways :
continue;
else:
pway=self.add_pathway_prefix + gene
self.pathways[pway] = [gene]
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys():
self.pathway_names[pway] = self.gene_mapping[gene]
else:
self.pathway_names[pway] = gene
else :
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys():
gene = self.gene_mapping[gene]
if pathway in self.pathways:
self.pathways[pathway].append(gene)
else:
self.pathways[pathway] = [gene]
self.pathway_names[pathway] = pathway_name
def dump_pathways(self):
return ( self.pathways,self.pathway_names )
def get_generator( self, verbose=False ):
if self.active_read :
if not self.file is None :
if 'str' in str(type(self.file)):
self.read_and_parse()
else:
self.parse_df()
if verbose :
print( self.dump_pathways() )
for key in self.pathways.keys():
yield( key , self.pathway_names[key] , self.pathways[key] )
class Reactome( GenericPathway ) :
def __init__( self , path , gene_name_start = 'ENSG0' ,pathway_desc_pos=3,
gene_mapping = None, lexical_restrictions = None, # GENE ID MAPPING
is_own_pathway = False, restrict_id_to = None ) :
self.file = path
self.prefered_genes = gene_name_start
self.pathways , self.pathway_names ,self.pathway_compartments = {},{},{}
self.is_own_pathway = is_own_pathway
self.gene_mapping = gene_mapping
self.gene_pos = 0
self.pathway_pos = 1
self.pathway_desc_pos = pathway_desc_pos
self.list_like = False
self.add_pathway_prefix = ''
self.replace_pair = None
self.lexical_restrictions = lexical_restrictions
self.lexical_restriction_category_pos=None
self.skipstr = None
self.pathway_tag = None
self.restrict_id_to = restrict_id_to
self.active_read = False
if 'str' in str(type(self.file)):
self .read_and_parse(keep_str='ENSG0')
else:
self .parse_df()
def read_and_parse( self , keep_str=None ) :
with open( self.file ) as input :
pathway, pathway_name, genes = "", "", []
for line in input :
if not self.skipstr is None :
if line.startswith(self.skipstr):
continue
if not keep_str is None :
if not line.startswith(keep_str):
continue
lspl = line.split('\t')
if ('[' in line) and (']' in line) :
compartment = find_compartment( line )[0]
else :
compartment = ''
if not 'None' in str(type(self.replace_pair)) :
pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] ).replace( self.replace_pair[0],self.replace_pair[1] )
else :
pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] )
if not self.restrict_id_to is None :
if not pathway in self.restrict_id_to:
continue
pathway_name = lspl[ self.pathway_desc_pos ]
if not self.lexical_restrictions is None:
if not np.sum( [ int(lr in pathway_name) for lr in self.lexical_restrictions] )>0 : #or len(compartment)<4
continue
if self.lexical_restriction_category_pos is None :
lex_restrict = pathway_name
else :
lex_restrict = lspl[ self.lexical_restriction_category_pos ]
if self.list_like :
# LIST LIKE PATHWAY INVENTORY CANNOT HAVE SELF MAPPING
# HERE WE ASSUME ALL GENES FOLLOW THE FIRST GENE_POS
genes = [ lspl[ir].replace('\n','') for ir in range(self.gene_pos,len(lspl)) ]
if not 'None' in str(type(self.gene_mapping)) :
renamed_genes = [ self.gene_mapping[gene] if gene in self.gene_mapping.keys() else gene for gene in genes ]
genes = renamed_genes
if pathway in self.pathways :
[ self.pathways[pathway].append(gene) for gene in genes ]
else :
self.pathways[pathway] = genes
self.pathway_names[pathway] = pathway_name
self.pathway_compartments[pathway] = compartment
else :
if not line.startswith(self.prefered_genes) or len(line)==0:
continue
gene = lspl[ self.gene_pos ]
if self.is_own_pathway :
if gene in self.pathways :
continue;
else:
pway = self.add_pathway_prefix + gene
self.pathways[pway] = [gene]
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys():
self.pathway_names[pway] = self.gene_mapping[gene]
else:
self.pathway_names[pway] = gene
else :
if not self.pathway_tag is None:
if not self.pathway_tag in pathway:
continue
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys():
gene = self.gene_mapping[gene]
if pathway in self.pathways:
self.pathways[pathway].append(gene)
else :
if True : # self.lexical_restrictions is None :
self.pathways[pathway] = [gene]
self.pathway_names[pathway] = pathway_name
else :
if len(set(self.lexical_restrictions)-set(lex_restrict.split()))<len(self.lexical_restrictions):
if len( set(self.lexical_restrictions) & set(lex_restrict.split()) )>0:
self.pathways[pathway] = [gene]
self.pathway_names[pathway] = pathway_name
def add_pathway_synonyms( self , synonym_dict, prefix='' ) :
for pathway, pathway_name, genes in self.get_generator() :
synonyms = []
for g in genes:
if g in synonym_dict or prefix+g in synonym_dict :
k = prefix * ( not g in synonym_dict ) + g
sdg = synonym_dict[ k ]
if 'list' in str(type(sdg)) :
[ synonyms.append(s) for s in sdg ]
if 'str' in str(type(sdg)) :
synonyms.append(sdg)
[ self.pathways[pathway].append(s) for s in synonyms ]
def read_extra_information ( self , file_name , valpos = 0 , keypos = 1 ,
add_value_prefix = '' , required_content = 'HSA' ,
add_desc_pos = None , comp_pos = None ) :
with open ( file_name ) as input :
pathway, genes = "", []
for line in input :
lspl = line.split( '\t' )
pw = lspl[ keypos ]
if required_content in pw :
gn = add_value_prefix + lspl[ valpos ]
if not self.restrict_id_to is None :
if not pw in self.restrict_id_to :
continue
if pw in self.pathways :
self.pathways[pw] .append(gn)
else :
self.pathways[pw] = [gn]
self.pathway_names[ pw ] = ''
if not add_desc_pos is None :
if lspl[add_desc_pos] not in self.pathway_names[pw] :
self.pathway_names[pw] += lspl[add_desc_pos] + ', '
if not comp_pos is None :
if self.pathway_compartments is None :
self.pathway_compartments = {}
if pw not in self.pathway_compartments :
self.pathway_compartments[pw] = [ ''.join(find_compartment(lspl[comp_pos])) ]
else :
self.pathway_compartments[pw] .append( ''.join(find_compartment(lspl[comp_pos])) )
def add_extra_information_from_dictionary( self , dictionary, map_onto_genes=True ):
for pathway in self.pathways :
genes = self.pathways[pathway]
add_these = []
[ [ add_these.append(v) for v in dictionary[g] ] for g in genes if g in dictionary.keys() ]
if len(add_these)>0:
[ self.pathways[pathway].append(a) for a in add_these]
def print_generator( generator , show_max_nr=3 , find_str=None ):
for pathway,pathway_name,genes in generator:
if not find_str is None:
if not find_str in pathway:
continue
print('Pathway: ' , pathway )
print('Pathway name: ', pathway_name )
print('Gene amount : ', len(genes), ' \t Gene name of first: ', genes[0] )
print('Gene dump : ', genes )
show_max_nr-=1
if show_max_nr == 0:
break;
def create_listdictionary_from_file( filename , delimiter = '\t' ,add_key_prefix='' ):
wanted_dictionary = dict()
with open( filename ) as input :
value, keyval, descriptor = "", "", []
for line in input:
all_vals = line.replace('\n','').replace(' ','').split( delimiter )
keyv = add_key_prefix+all_vals[0]
valv = all_vals[1:]
wanted_dictionary[keyv] = valv
return ( wanted_dictionary )
def flatten_generator(pathway_generator_object):
for pathway,genes in pathway_generator_object.pathways.items() :
ngenes = []
[ ngenes.append(g) if 'str' in str(type(g)) else [ ngenes.append(q) for q in g ] for g in genes ]
pathway_generator_object.pathways[pathway] = list(set(ngenes))
if pathway in pathway_generator_object.pathway_compartments:
pathway_generator_object.pathway_names[pathway] = '[' + \
','.join(list(set( pathway_generator_object.pathway_compartments[pathway] )) ) + \
'] ' + pathway_generator_object.pathway_names[pathway]
if __name__ == '__main__' :
print('ADD BACK REACTOME TEST')
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import numpy as np
import pandas as pd
def hcurve ( x ) :
return ( x )
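#
# D SETS THE DEFAULT NEIGHBOUR COUNT K FOR A GIVEN DIMENSION d (SEE cluster_probability_of_x BELOW) :
# D(0) = 0 AND D(d) = D(d-1) + 2**d , I.E. D(d) = 2**(d+1) - 2 ,
# WHILE THE LINEAR VARIANT SIMPLY USES floor(3.5*d)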
def D ( d , bLinear=False ) :
if bLinear :
return ( int(np.floor(3.5*d)) )
if d==0 :
return ( 0 )
else :
return ( D(d-1) + 2**d )
def cluster_probability_of_x ( X ,
evidence , labels , centroids ,
K = None , bLinear = False ) :
ispanda = lambda P: 'pandas' in str(type(P)).lower()
BustedPanda = lambda R : R.values if ispanda(R) else R
desc_ = """
DOING THE SIMPLE PROBABILISTIC MODELLING FOR THE DATA
    YOU CAN USE THIS TO DO VORONOI TESSELLATION WITH K = 1 IN 2D
X IS A LIST OF COORDINATE POINTS TO EVALUATE THE FUNCTION ON
EVIDENCE IS A LIST OF ACTUAL EXPERIMENTAL DATA COORDINATES
LABELS ARE THE IMPUTED CLUSTER LABELS OF THE EXPERIMENTAL DATA POINTS
CENTROIDS ARE THE CENTROID COORDINATES
K IS THE NEIGHBOUR DIMENSION
BLINEAR SPECIFIES THAT ...
labels , centroids = impq.knn_clustering_alignment ( P , Q )
evidence = P.values
THIS IS NOT THE SAIGA YOU ARE LOOKING FOR -RICHARD TJÖRNHAMMAR
"""
ev_ = BustedPanda ( evidence )
N , DIM = np.shape( ev_ )
if K is None :
K = D ( DIM , bLinear )
#
# INPUT TYPE CHECKING
if True :
bFailure = False
if not ( 'array' in str(type(labels)) and len(np.shape(labels)) == 2 ) :
print("THE LABELS SHOULD BE 2D ARRAY OF DIMENSION N,1 "); bFailure=True
if not ( 'array' in str(type(ev_)) and len(np.shape(ev_)) == 2 ) :
print("EV_ SHOULD BE 2D ARRAY OF DIMENSION N,DIM "); bFailure=True
if not ( 'array' in str(type(centroids)) and len(np.shape(centroids)) == 2 ) :
print("CENTROIDS SHOULD BE 2D ARRAY OF DIMENSION M,DIM "); bFailure=True
if not ( 'array' in str(type(X)) and len(np.shape(X)) == 2 ) :
print("X SHOULD BE 2D ARRAY OF DIMENSION P,DIM "); bFailure=True
if bFailure :
print( desc_ )
exit(1)
#
all_cluster_names = sorted(list(set( [ l[0] for l in labels] )))
print ( all_cluster_names )
#
Rdf = None
for x in X :
S = sorted([ (s,i) for (s,i) in zip( np.sum( (x - ev_)**2 , 1 ) , range(len(ev_)) ) ])[:K]
present_clusters = [ labels[s[1]][0] for s in S ]
pY = []
for yi in all_cluster_names:
pY.append( np.sum(yi==present_clusters)/K )
tdf = pd.DataFrame( [[*x,*pY]] ,
columns = [ *['d'+str(i) for i in range(len(x))] ,
*all_cluster_names ] )
if Rdf is None :
Rdf = tdf
else :
Rdf = pd.concat([Rdf,tdf])
Rdf.index = range(len(X))
probability_df = Rdf.copy()
return ( probability_df )
import impetuous.quantification as impq
if __name__ == '__main__' :
#
# IF YOU REQUIRE THE DATA THEN LOOK IN :
# https://github.com/richardtjornhammar/RichTools
# WHERE YOU CAN FIND THE FILES USED HERE
#
if True :
print( D(1),D(2),D(3),D(4) )
print( D(1,True),D(2,True),D(3,True),D(4,True) )
colors = {'H':'#777777','C':'#00FF00','N':'#FF00FF','O':'#FF0000','P':'#FAFAFA'}
Q = read_xyz( name='data/naj.xyz' , header=2 , sep=' ' )
        P = read_xyz( name='data/cluster0.xyz' , header=2 , sep='\t' ) # IF IT FAILS THEN CHECK THE FORMATTING OF THIS FILE
if True :
labels , centroids = impq.knn_clustering_alignment ( P , Q )
pd.DataFrame( labels ) .to_csv( 'labels.csv' ,'\t' )
pd.DataFrame( centroids).to_csv( 'centroids.csv','\t' )
else :
labels = pd.read_csv('labels.csv','\t',index_col=0).values
centroids = pd.read_csv('centroids.csv','\t',index_col=0).values
p = cluster_probability_of_x ( X = np.array( [ [-20.5, 24.6 , 84] , [10,10,10] ] ) ,
evidence = P.values ,
labels = labels.reshape(-1,1) ,
centroids = centroids ,
K = None )
print ( p )
This diff has been collapsed.
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
#
# MOVED OVER FROM RANKOR (reducer.py) AS OF COMMIT :
# https://github.com/richardtjornhammar/rankor/commit/f04a83091a92130b9b73439b4ad65b5be3056cf9
#
import pandas as pd
import numpy as np
from scipy.stats import rankdata
from impetuous.quantification import qvalues, permuter
#
# REGULAR CONTRAST
contrast = lambda A,B : ( A-B )/( A+B )
#
# OTHER
e_flatness = lambda x : np.exp(np.mean(np.log(x),0))/np.mean(x,0)
e_contrast = lambda x : 1 - e_flatness(x)
pi0 = lambda pvs : 1.
def padded_rolling_average( tv , tau ) :
# AS OF THIS PANDAS VERSION ( 1.1.0 )
# WINDOW CALCULATION WAS NOT PADDED SO
# THIS IS A NUMPY ONLY IMPLEMENTATION
if tau==1 :
return ( tv )
if len(tv)<tau :
return ( [ np.mean(v_) for v_ in tv ] )
centered = lambda x:(x[0],x[1]) ; N=len(tv);
w = int(np.floor(np.abs(tau)*0.5)) ;
jid = lambda i,w,N:[int((i-w)>0)*((i-w)%N),int(i+w<N)*((i+w)%N)+int(i+w>=N)*(N-1)]
idx = [ centered( jid(i,w,N) ) for i in range(N) ]
mvalues = [ np.mean(tv[i[0]:i[1]]) for i in idx ]
return ( mvalues )
def svd_reduced_mean ( x,axis=0,keep=[0] ) :
if True :
sk = set ( keep )
if len ( np.shape(x) ) > 1 :
u , s , vt = np .linalg .svd( x , full_matrices=False )
xred = np.mean( np.dot(u*[s[i_] if i_ in sk else 0 for i_ in range(len(s))],vt) , axis)
if 'pandas' in str(type(x)) :
if not 'series' in str(type(x)) :
xname = x.index.values[0]
return ( pd.DataFrame( [xred] , index=[xname] , columns=x.columns ) )
else :
xname = x.name
return ( pd.Series( xred , name=xname , index=x.columns ) )
else :
return ( xred )
return ( x )
from sklearn.decomposition import PCA
dimred = PCA ( n_components = 1 )
def pca_reduced_mean( x ) :
if True :
if len ( np.shape(x) ) > 1 :
Xnew = dimred.fit_transform( x.T )
xred = Xnew . T [0] + np.mean(np.mean(x))
if 'pandas' in str(type(x)) :
if not 'series' in str(type(x)) :
xname = x.index.values[0]
return ( pd.DataFrame( [xred] , index=[xname] , columns=x.columns ) )
else :
xname = x.name
return ( pd.Series( xred , name=xname , index=x.columns ) )
return ( x )
def reduction ( a , power , centered=-1 ) :
if centered>0 :
a = ( a.T-np.mean(a,1) ).T
return( np.linalg.svd ( a**power , full_matrices=False ) )
def hyper_params ( df_ , label = 'generic' , sep = ',' , power=1., centered=-1 ):
#
idx_ = df_.index.values
N_s = len ( df_.columns )
u,s,vt = reduction( df_.values , power , centered=centered )
rdf_ = pd.Series ( np.sum(u**2,1) , index=idx_ , name = label+sep+"u" )
rdf_ = pd.concat ( [ pd.DataFrame(rdf_) ,
pd.DataFrame( pd.Series( np.mean( df_.values,1 ) ,
index = idx_ , name=label+sep+"m") ) ] , axis=1 )
w_ = rdf_ .loc[ :,label+sep+"u" ].values
r_ = rankdata ( [ w for w in w_ ] ,'average' )
N = len ( r_ )
#
df0_ = pd.DataFrame( [ [a for a in range(N)],w_,r_ ],index=['idx','w_','r_'], columns=idx_ ).T
#
from scipy.special import erf as erf_
loc_pval = lambda X , mean , stdev : [ 1. - 0.5*( 1. + erf_( ( x - mean )/stdev/np.sqrt(2.) ) ) for x in X ]
lvs = np.log( df0_.loc[ :,'w_'].values )
#
return ( np.mean(lvs) , np.std(lvs) )
def hyper_rdf ( df_ , label = 'generic' , sep = ',' , power=1. ,
diagnostic_output = False , MEAN=None , STD=None , centered=-1 ) :
#
idx_= df_.index.values
N_s = len ( df_.columns )
u,s,vt = reduction ( df_.values , power , centered=centered )
rdf_ = pd.Series ( np.sum( ( u )**2,1 ) , index=idx_ , name = label+sep+"u" )
rdf_ = pd.concat ( [ pd.DataFrame( rdf_ ) ,
pd.DataFrame( pd.Series( np.mean( df_.values,1 ) ,
index = idx_ , name=label+sep+"m") ) ] , axis=1 )
w_ = rdf_ .loc[ :,label+sep+"u" ].values
r_ = rankdata ( [ w for w in w_] ,'average' )
N = len ( r_ )
#
    # HERE WE CONSTRUCT A DIAGNOSTICS AND INTERMEDIATE CALCULATION
# DATAFRAME FOR EASIER DEBUGGING AND INSPECTION
df0_ = pd.DataFrame( [ [a for a in range(N)],w_,r_ ],index=['idx','w_','r_'], columns=idx_ )
df0_ = df0_.T.sort_values ( by='r_' )
df0_ .loc[ : , 'd_' ] = [ v for v in ( df0_.loc[:, 'w_' ] * df0_.loc[:,'r_'] ) ]
df0_ .loc[ : , 'da_' ] = np.cumsum ( df0_ .loc[ : , 'd_' ].values )
#
# HOW MANY EQUALLY OR MORE EXTREME VALUES ARE THERE? ( RANK BASED )
df0_ .loc[ : , 'dt_' ] = np.cumsum ( df0_ .loc[ : , 'd_' ].values[::-1] )[::-1]
df0_ .loc[ : , 'rank_ps' ] = df0_ .loc[ :,'dt_' ] / np.sum( df0_ .loc[ :,'d_' ] )
#
# DRAW HISTOGRAM TO SEE DISTRIBUTION OF THE DISTANCES
from scipy.special import erf as erf_
loc_pval = lambda X , mean , stdev : [ 1. - 0.5*( 1. + erf_( ( x - mean )/stdev/np.sqrt(2.) ) ) for x in X ]
lvs = np.log( df0_.loc[ :,'w_'].values )
if MEAN is None or STD is None:
if len(lvs)>3 :
ps = loc_pval( lvs , np.mean( lvs ) , np.std( lvs ) )
else :
ps = df0_ .loc[ : , 'rank_ps' ]
else :
ps = loc_pval( lvs , MEAN , STD )
#
if diagnostic_output :
import scipy.stats as scs
NB = 100
lv = np.log( rdf_.loc[:,[label+sep+"u"]].values )
y , x = np.histogram( lv , bins=NB , density=True )
skw = scs.skew(lv)[0]
kur = scs.kurtosis(lv)[0]
shape_stats = "kurtosis: " + "{:.2f} ".format( kur ) + "skewness: "+ "{:.2f}".format( skw )
locd = lambda x,M,s : (1./s/np.sqrt(2.*np.pi))*np.exp(-0.5*((x-M)/s)**2 )
lin_x = 0.5 * ( x[1:] + x[:-1] )
his_y = y
rsy = sorted( [ (y_,x_) for (y_,x_) in zip(y,lin_x) ] )[::-1]
hm , hs = np.mean([rsy[i][1] for i in range(5)]) , np.mean([rsy[i][0] for i in range(5)])
h_mod_y = locd( lin_x , hm , 1.0/(hs*np.sqrt(2.*np.pi)) )
d_mod_y = locd( lv , np.mean(lv) , np.std(lv) )
rem_y = [ (y_-m_) for (y_,m_) in zip(y, locd(0.5*(x[1:]+x[:-1]),np.mean(lv),np.std(lv))) ]
prc_y = [ 100.*np.abs( contrast(y_,m_) ) for (y_,m_) in zip(y, locd(0.5*(x[1:]+x[:-1]),np.mean(lv),np.std(lv))) ]
RMSD = np.sqrt(np.sum([ ry**2 for ry in rem_y ]))
PMAE = np.mean(prc_y)
#
df0_ .loc[ : , 'pvalues' ] = ps
#
# ASSIGN THE P VALUES
rdf_.loc[df0_.index.values,label+sep+"p"] = df0_.loc[:,'pvalues']
rdf_.loc[df0_.index.values,label+sep+"q"] = [ qvs[0] for qvs in qvalues( df0_.loc[:,'pvalues'].values , pi0 = pi0(df0_.loc[:,'pvalues'].values) ) ]
rdf_.loc[df0_.index.values,label+sep+"r"] = df0_ .loc[ : , 'rank_ps' ]
#
# AND RETURN THE RESULTS
if diagnostic_output :
return ( rdf_ , RMSD , PMAE , kur , skw , rem_y )
else :
return ( rdf_ )
if __name__ == '__main__' :
#
print ( 'REDUCER :: TESTS ' )
#
a = 2*np.random.rand(10)
b = 4*np.random.rand(10)
X = [ [*(a[:5]+1),*a[5:]],[*(b[:5]+3),*(b[5:])] ]
Xdf = pd.DataFrame( X , columns=['a','b','s','c','d','e','f','g','h','i'] , index=['i0','i1'])
#
print ( Xdf )
print ( svd_reduced_mean ( X ) )
print ( svd_reduced_mean ( Xdf ) )
v0 = [1.9,1.8,2.1,1.1,8.,1.2,2.2,3.5,2.0,2.0,3.1,2.1,2.9]
a2 = np.array([[2,2,2,1,1,1,2,3,2,2,3,2,3],v0])
NW = 100
if False:
for i in range(NW):
for w in range(1,i-1):
dat = np.random.rand(i)
pra = padded_rolling_average( dat,tau=w )
str_pra = ''.join([ str(p) for p in pra ])
if 'nan' in str_pra:
print ( pra,len(pra),'[',i,w ,']')
else:
if not len(pra)==len(dat):
print ( len(pra)==len(dat),'[',i,w ,']',len(pra),len(dat) )
print ( padded_rolling_average( np.array( svd_reduced_mean( Xdf ).values ).reshape(-1,1) ,tau=4 ) )
print ( padded_rolling_average( np.array( svd_reduced_mean( Xdf ).values ).reshape(1,-1) ,tau=4 ) )
print ( padded_rolling_average( np.array( svd_reduced_mean( Xdf ).values ) ,tau=4 ) )
print ( padded_rolling_average( v0,tau=4 ) )
print ( v0 )
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
from scipy.stats import rankdata
beta2M = lambda b : np.log2( b/(1.-b) )
M2beta = lambda M : 2**M/(2**M+1)
#
# FRACTIONAL RANKS ON [0,1] ARE RECAST TO +- INFINITY
map_df_to_spectral_order_domain = lambda df: \
df .apply ( lambda x:(rankdata(x,'average')-0.5)/len(x) ) \
.apply ( lambda B:[ beta2M(b) for b in B] )
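#
# FOR INTUITION ( A SMALL ILLUSTRATIVE SKETCH ) :
#   beta2M( 0.5 ) = 0.0 AND M2beta( 0.0 ) = 0.5 , SO THE TWO TRANSFORMS ARE INVERSES ;
#   A FRACTIONAL RANK r IN (0,1) IS MAPPED TO THE M VALUE log2( r/(1-r) ) ,
#   PUSHING RANKS CLOSE TO 0 OR 1 TOWARDS -INF OR +INF RESPECTIVELY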
power = lambda s : np.dot( s.T,np.conj(s) )
#
# THE SPECTRUM IS TRANSFORMED INTO A POWER SPECTRUM
spectre_to_power = lambda sdf: \
pd.DataFrame ( \
power ( sdf.T.apply(np.fft.fft) ) , \
index = sdf.index , columns = sdf.index \
)
#
# THE BELOW METHOD IS HIGHLY EXPERIMENTAL
def transform_to_resonances( spectrum ) :
# assumes spectrum is a square symmetric matrix
f_ls = np.mean( np.abs(spectrum.iloc[0,:].quantile([0.01,0.99])) )
reso = spectrum .apply(lambda ser: np.real(ser.to_numpy())) \
.apply(lambda X:[x/f_ls for x in X]) .apply(M2beta) \
.apply(lambda X:[ 2.*x-1. for x in X] )
return ( reso )
#
# TIME AUTOCORRELATIONS ARE NOT THE SAME THING
# AS PEARSON AUTOCORRELATIONS
def calc_autocorrelation( tv , dt=1 , bMeanCentered=True ) :
    # Computes the time autocorrelation via the FFT,
    # i.e. the Wiener-Khinchin theorem
    if bMeanCentered :
        # For e.g. stock price series you typically
        # want to remove the mean first
        tv -= np.mean(tv)
ftv = np.fft.fft(tv)
rt = np.fft.ifft(ftv*np.conj(ftv))
rt = rt/rt[0]
rt = rt[:int(np.floor(len(rt)*0.5))]
return( [dt*t for t in range(len(rt))], [np.real(r) for r in rt] )
if __name__ == '__main__' :
print ( 'SORRY: NO TESTS HERE' )
print ( 'DEVELOPMENTAL VERSION')
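    #
    # NEVERTHELESS , A MINIMAL ILLUSTRATIVE SKETCH ( NOT A FORMAL TEST ) OF THE
    # TRANSFORMS ABOVE , USING A SMALL RANDOM DATA SET WITH HYPOTHETICAL LABELS
    demo_df = pd.DataFrame ( np.random.rand(4,64) ,
                  index   = ['a','b','c','d'] ,
                  columns = [ 'x'+str(i) for i in range(64) ] )
    # RANKS -> M VALUES , THEN THE POWER SPECTRUM OF THE ROWS
    sdf_ = map_df_to_spectral_order_domain ( demo_df )
    pwr_ = spectre_to_power ( sdf_ )
    print ( pwr_ )
    # TIME AUTOCORRELATION OF A SINGLE ROW ( COPY SO THE ORIGINAL IS NOT MEAN CENTERED IN PLACE )
    taus , acs = calc_autocorrelation ( demo_df.loc['a',:].values.copy() , dt=1 )
    print ( acs[:5] )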
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import numpy as np
import pandas as pd
import bokeh
description="""Rudimentary plotting wrappers for impetuous using bokeh"""
run_in_notebook="""
from bokeh.plotting import output_notebook, show
output_notebook()"""
typecheck = lambda x,typ: typ in str(type(x))
list_typecheck = lambda xl,typ, func: func( [ typecheck(x,typ) for x in xl ] )
nice_colors = list( set( [ '#c5c8c6' , '#1d1f21' , '#282a2e' , '#373b41' , '#a54242' , '#cc6666' ,
'#8c9440' , '#b5bd68' , '#de935f' , '#f0c674' , '#5f819d' , '#81a2be' ,
'#85678f' , '#b294bb' , '#5e8d87' , '#8abeb7' , '#707880' ] ) )
make_hex_colors = lambda c : '#%02x%02x%02x' % (c[0]%256,c[1]%256,c[2]%256)
# SCATTER PLOT WRAPPER : CREATES A NEW FIGURE OR ADDS POINTS TO AN EXISTING ONE ( p )
def bscatter ( X , Y , additional_dictionary=None , title='' , color='#ff0000' , p=None, legend_label = None , alpha=1 , axis_labels = None ) :
from bokeh.plotting import figure, output_file, ColumnDataSource
from bokeh.models import HoverTool, Range1d, Text
from bokeh.models import Arrow, OpenHead, NormalHead, VeeHead, Line
#
if 'str' in str(type(color)):
colors_ = [ color for v in X ]
else :
colors_ = color
if 'list' in str(type(alpha)):
alphas_ = alpha
else :
alphas_ = [ alpha for v in X ]
data = { **{'x' : X , 'y' : Y ,
'color': colors_ ,
'alpha': alphas_ } }
ttips = [ ("index " , "$index" ) ,
("(x,y) " , "(@x, @y)" ) ]
if not additional_dictionary is None :
if 'dict' in str(type(additional_dictionary)):
data = {**data , **additional_dictionary }
for key in additional_dictionary.keys() :
ttips.append( ( str(key) , '@'+str(key) ))
source = ColumnDataSource ( data = data )
hover = HoverTool ( tooltips = ttips )
#
if p is None :
p = figure ( plot_width=600 , plot_height=600 ,
tools = [hover,'box_zoom','wheel_zoom','pan','reset','save'],
title = title )
if legend_label is None :
p.circle( 'x' , 'y' , size=12, source=source , color='color', alpha='alpha' )
else :
p.circle( 'x' , 'y' , size=12, source=source , color='color', alpha='alpha' , legend_label=legend_label )
p.xaxis.axis_label = axis_labels[ 0] if not axis_labels is None else 'x'
p.yaxis.axis_label = axis_labels[-1] if not axis_labels is None else 'y'
p.output_backend = 'webgl'
return( p )
# CONVENIENCE WRAPPER : PLOT A SINGLE SCATTER OR SEVERAL SERIES ( LISTS OF X , Y AND COLORS )
def plotter ( x = np.random.rand(10) , y = np.random.rand(10) , colors = '#ff0000' ,
              legends=None, axis_labels = None, bSave = False, name='scatter.html' ):
from bokeh.plotting import output_file, show, save
output_file( name )
    # SELECT THE OUTPUT FUNCTION : SAVE TO FILE OR SHOW INTERACTIVELY
    outp = save if bSave else show
if list_typecheck([x,y,colors],'list',all) :
p = bscatter( [0] , [0] , color = "#ffffff" , alpha=0 )
for i in range(len(x)) :
x_ , y_ , color = x[i] , y[i] , colors[i]
if list_typecheck([legends,axis_labels],'list',all):
label = legends[i]
p = bscatter( x_ , y_ , color = color , p = p , legend_label = label , axis_labels=axis_labels )
else :
p = bscatter( x_ , y_ , color = color , p = p )
outp ( p )
else :
p = bscatter( x,y, color=colors )
outp ( p )
return( p )
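if __name__ == '__main__' :
    #
    # MINIMAL USAGE SKETCH ( ASSUMES A WORKING BOKEH INSTALLATION ; THE DATA AND
    # FILE NAME ARE HYPOTHETICAL ) : TWO RANDOM SERIES SAVED TO scatter.html
    xs = [ np.random.rand(10) , np.random.rand(10) ]
    ys = [ np.random.rand(10) , np.random.rand(10) ]
    p  = plotter ( x = xs , y = ys , colors = ['#ff0000','#0000ff'] ,
                   legends = ['red','blue'] , axis_labels = ['x','y'] ,
                   bSave = True , name = 'scatter.html' )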
Metadata-Version: 2.1
Name: impetuous-gfa
Version: 0.36.3
Summary: Impetuous Quantification, a Statistical Learning library for Humans : Alignments, Clustering, Enrichments and Group Analysis
Home-page: https://github.com/richardtjornhammar/impetuous
Author: Richard Tjörnhammar
Author-email: richard.tjornhammar@gmail.com
License: UNKNOWN
Description: # A Statistical Learning library for Humans
Decomposes a set of expressions into a group expression. The toolkit currently offers enrichment analysis, hierarchical enrichment analysis, PLS regression, shape alignment and clustering, as well as rudimentary factor analysis.
Expression regulation can be studied via a statistical test that relates it to the observables in the journal file. The resulting p values are FDR corrected and reported as adjusted p values.
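For intuition, the FDR step can be pictured as a Benjamini-Hochberg style adjustment; the sketch below is a generic illustration only and not the library's own `qvalues` routine:
```
import numpy as np

def bh_adjust ( pvalues ) :
    # Benjamini-Hochberg adjusted p values, illustrative only
    p = np.asarray( pvalues , dtype=float )
    n = len( p )
    order = np.argsort( p )
    ranked = p[order] * n / np.arange( 1 , n+1 )
    # enforce monotonicity from the largest p value downwards
    q = np.minimum.accumulate( ranked[::-1] )[::-1]
    q = np.clip( q , 0 , 1 )
    out = np.empty( n )
    out[order] = q
    return ( out )

print ( bh_adjust( [0.001, 0.01, 0.02, 0.8] ) )
```
The library's own FDR step may differ in detail (for example through a pi0 estimate), so treat this only as a mental model.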
Visit the active code via :
https://github.com/richardtjornhammar/impetuous
Visit the published code :
https://doi.org/10.5281/zenodo.2594690
Cite using :
DOI: 10.5281/zenodo.2594690
# Pip installation with :
```
pip install impetuous-gfa
```
# Version controlled installation of the Impetuous library
In order to run the Impetuous code snippets we recommend that you install the Nix package manager. Nix package manager links from October 2020:
https://nixos.org/download.html
```
$ curl -L https://nixos.org/nix/install | sh
```
If you cannot install it on your Windows machine then please consider installing the Windows Subsystem for Linux first:
```
https://docs.microsoft.com/en-us/windows/wsl/install-win10
```
In order to run the notebook code you must enter a sensible working environment. Don't worry! We have created one for you. It is version controlled against Python 3.7 and you can get the file here:
https://github.com/richardtjornhammar/rixcfgs/blob/master/code/environments/impetuous-shell.nix
Since you have installed Nix as well as WSL, or use a Linux (NixOS) or BSD-like system, you should be able to execute the following command in a terminal:
```
$ nix-shell impetuous-shell.nix
```
Now you should be able to start your jupyter notebook locally:
```
$ jupyter-notebook impetuous_finance.ipynb
```
and that's it.
# Usage example 1 : elaborate informatics example
code: https://gitlab.com/stochasticdynamics/eplsmta-experiments
docs: https://arxiv.org/pdf/2001.06544.pdf
# Usage example 2 : simple code example
Now, while in a good environment, you can write the following in your Jupyter notebook or in a dedicated file.py:
```
import pandas as pd
import numpy as np
import impetuous.quantification as impq
adf = pd.read_csv( 'analytes.csv' , sep='\t' , index_col=0 )
jdf = pd.read_csv( 'journal.csv' , sep='\t' , index_col=0 )
res_dfs = impq.run_rpls_regression ( adf , jdf , 'S ~ C(industry)' , owner_by = 'angle' )
print ( res_dfs )
```
# Manually updated code backups for this library :
GitLab: https://gitlab.com/richardtjornhammar/impetuous
CSDN: https://codechina.csdn.net/m0_52121311/impetuous
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Description-Content-Type: text/markdown
README.md
setup.py
impetuous_gfa.egg-info/PKG-INFO
impetuous_gfa.egg-info/SOURCES.txt
impetuous_gfa.egg-info/dependency_links.txt
impetuous_gfa.egg-info/top_level.txt
src/impetuous/__init__.py
src/impetuous/clustering.py
src/impetuous/convert.py
src/impetuous/fit.py
src/impetuous/hierarchal.py
src/impetuous/impetuous.py
src/impetuous/pathways.py
src/impetuous/probabilistic.py
src/impetuous/quantification.py
src/impetuous/reducer.py
src/impetuous/spectral.py
src/impetuous/visualisation.py