def permutation_test_tvd(table, variable, classes, repetitions):
    """Run a permutation test of whether a categorical variable is independent of a binary class.

    The test statistic is the total variation distance (TVD) between the
    distributions of ``variable`` in the two classes. Under the null
    hypothesis the two class samples are drawn from the same underlying
    distribution, so the ``variable`` column is randomly permuted against the
    ``classes`` column and the TVD is recomputed for every permutation.

    Prints the observed TVD and the empirical P-value, and draws a histogram
    of the simulated TVDs. Nothing is returned.

    Args:
        table: table containing the sample.
        variable: label of the column holding the categorical variable whose
            distribution is of interest.
        classes: label of the column holding the binary class data.
        repetitions: number of random permutations to simulate.

    NOTE(review): ``tvd``, ``proportions``, ``Table`` and ``plots`` are not
    defined in this block and are assumed to be in scope from earlier in the
    document.
    """
    # Observed statistic. Columns 1 and 2 of the pivot are presumed to hold
    # the per-class counts (one column per class value) -- this relies on
    # `classes` being binary, as the parameter description states.
    counts = table.select(classes, variable).pivot(classes, variable)
    observed_tvd = tvd(proportions(counts.column(1)), proportions(counts.column(2)))

    # These single-column tables do not change between repetitions, so build
    # them once. This assumes `sample` and `with_column` return new tables
    # rather than mutating their receiver (see the datascience Table docs).
    variable_only = table.select(variable)
    classes_only = table.select(classes)

    # Assuming the null is true, randomly permute the variable and collect
    # all the new TVDs. A list is used so that each repetition is O(1);
    # np.append would copy the whole array on every iteration.
    simulated = []
    for _ in np.arange(repetitions):
        shuffled_var = variable_only.sample(with_replacement=False).column(0)
        shuffled = classes_only.with_column('Shuffled Variable', shuffled_var)
        shuffled_counts = shuffled.pivot(classes, 'Shuffled Variable')
        simulated.append(
            tvd(proportions(shuffled_counts.column(1)),
                proportions(shuffled_counts.column(2)))
        )
    tvds = np.array(simulated, dtype=float)

    # Empirical P-value: the proportion of simulated distances at least as
    # large as the observed one (larger distances favour the alternative).
    emp_p = np.count_nonzero(tvds >= observed_tvd) / repetitions

    # Draw the empirical histogram of the TVDs generated under the null,
    # and compare with the value observed in the original sample.
    Table().with_column('TVD', tvds).hist(bins=20)
    plots.title('Empirical Distribution Under the Null')
    print('Observed TVD:', observed_tvd)
    print('Empirical P-value:', emp_p)


# Example from the text, with the output recorded there:
#   permutation_test_tvd(patients, 'Clump Thickness', 'Class', 5000)
#   Observed TVD: 0.638310905047
#   Empirical P-value: 0.0