首页 > 解决方案 > R:用于正态性检验的 Kolmogorov-Smirnov 检验可视化(经验 CDF 与理论 CDF 的比较)

问题描述

我想使用 Kolmogorov-Smirnov 测试 (ks-test) 检查正常性并将其可视化以支持我的演示。

我使用了这个默认代码

sample1 <- rnorm(10000, 10, 5)
sample2 <- rnorm(10000, 1, 5)
group <- c(rep("sample1", length(sample1)), rep("sample2", length(sample2)))
dat <- data.frame(KSD = c(sample1,sample2), group = group)
# create ECDF of data
cdf1 <- ecdf(sample1) 
cdf2 <- ecdf(sample2) 
# find min and max statistics to draw line between points of greatest distance
minMax <- seq(min(sample1, sample2), max(sample1, sample2), length.out=length(sample1)) 
x0 <- minMax[which( abs(cdf1(minMax) - cdf2(minMax)) == max(abs(cdf1(minMax) - cdf2(minMax))) )] 
y0 <- cdf1(x0) 
y1 <- cdf2(x0) 

ggplot(dat, aes(x = KSD, group = group, color = group))+
  stat_ecdf(size=1) +
  theme_bw(base_size = 28) +
  theme(legend.position ="top") +
  xlab("Sample") +
  ylab("ECDF") +
  #geom_line(size=1) +
  geom_segment(aes(x = x0[1], y = y0[1], xend = x0[1], yend = y1[1]),
               linetype = "dashed", color = "red") +
  geom_point(aes(x = x0[1] , y= y0[1]), color="red", size=8) +
  geom_point(aes(x = x0[1] , y= y1[1]), color="red", size=8) +
  ggtitle("K-S Test: Sample 1 / Sample 2") +
  theme(legend.title=element_blank())

结果是这样的: 在此处输入图像描述

现在,我想用我自己的数据替换它,

mydata <- c(33, 28, 23, 10, 27, 22, 25, 32, 26, 14, 37, 16, 30, 34, 26, 27, 18, 22, 23, 37, 16, 26, 31, 20, 25, 34, 24, 12, 28, 27, 40, 25, 23, 32, 23, 32, 17, 27, 18, 28, 17, 29, 12, 36, 32, 25, 32, 23, 16, 29, 18, 18, 46, 33, 30, 22, 21, 30, 35, 25, 13, 17, 28, 24, 32, 26, 10, 35, 23, 28, 19, 13, 24, 30, 20, 19, 26, 36, 28, 32, 28, 29, 32, 13, 32, 30, 36, 25, 20, 32, 18, 30, 27, 32, 23, 33, 30, 31, 20, 35, 29, 24, 35, 21, 18, 16, 23, 29, 26, 14, 13, 29, 24, 30, 13, 28, 24, 19, 28, 16, 25, 28, 30, 32, 14, 10, 12, 39, 16, 20, 20, 12, 30, 17, 28, 28, 37, 22, 16, 21, 32, 20, 25, 19, 37, 26, 13, 22, 18, 18, 16, 30, 16, 34, 11, 16, 14, 32, 35, 16, 17, 29, 17, 24, 28, 15, 21, 24, 18, 20, 16, 24, 20, 22, 30, 24, 27, 26, 42, 22, 35, 24, 19, 10, 34, 19, 18, 30, 19, 13, 17, 27, 28, 36, 23, 23, 25, 19, 12, 30, 15, 38, 15, 36, 19, 28, 27, 12, 31, 18, 26, 15, 34, 29, 32, 19, 14, 34, 24, 11, 23, 29, 15, 23, 16, 28, 11, 37, 23, 31, 14, 32, 19, 27, 29, 10, 19, 12, 19, 17, 34, 24, 26, 17, 15, 28, 18, 14, 24, 22, 13, 10, 23, 22, 19, 31, 22, 12, 23, 27, 16, 13, 22, 34, 12, 30, 20, 30, 29, 25, 35, 19, 11, 27, 16, 22, 10, 35, 26, 14, 32, 16, 24, 15, 19, 18, 37, 24, 30, 16, 22, 38, 25, 32, 28, 21, 26, 23, 10, 19, 20, 23, 15, 25, 23, 14, 20, 33, 29, 14, 11, 13, 28, 17, 24, 19, 12, 35, 30, 25, 15, 25, 24, 37, 17, 25, 11, 24, 18, 19, 27, 29, 15, 29, 12, 10, 22, 14, 18, 15, 15, 15, 23, 15, 23, 27, 21, 39, 13, 23, 13, 26, 20, 22, 20, 26, 37, 21, 20, 17, 34, 30, 18, 30, 34, 27, 10, 29, 10, 18, 18, 12, 19, 27, 20, 26, 27, 13, 23, 24, 10, 14, 22, 17, 19, 15, 24, 15, 16, 21, 24, 27, 17, 14, 29, 22, 26, 16, 17, 15, 28, 20, 32, 18, 25, 24, 15, 10, 25, 35, 30, 10, 34, 20, 36, 25, 15, 35, 22, 22, 12, 23, 20, 12, 27, 16, 30, 28, 12, 19, 15, 25, 12, 26, 21, 13, 19, 23, 31, 12, 11, 16, 12, 11, 33, 30, 30, 20, 23, 30, 19, 32, 21, 32, 36, 18, 13, 24, 26, 21, 20, 19, 34, 42, 20, 19, 31, 12, 24, 21, 30, 13, 14, 24, 18, 17, 13, 22, 37, 24, 21, 17, 13, 18, 50, 16, 25, 19, 20, 14, 15, 18, 44, 29, 21, 33, 18, 25, 35, 29, 17, 27, 41, 22, 22, 32, 34, 33, 20, 31, 30, 27, 22, 14, 23, 19, 27, 17, 26, 12, 27, 24, 36, 19, 21, 23, 40, 29, 27)

我可以mydata用作经验 CDF,但如何更改sample2为理论 CDF?

目标:

提前致谢。

标签: rggplot2visualizationkolmogorov-smirnov

解决方案


R 中的理论累积概率函数由p*分布函数的版本给出。如果您想测试正态性,函数将是pnorm. 您将在下面找到一种绘制此图的方法。黑线表示最大差异点。

library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.0.5

mydata <- c(33, 28, 23, 10, 27, 22, 25, 32, 26, 14, 37, 16, 30, 34, 26, 27, 18, 22, 23, 37, 16, 26, 31, 20, 25, 34, 24, 12, 28, 27, 40, 25, 23, 32, 23, 32, 17, 27, 18, 28, 17, 29, 12, 36, 32, 25, 32, 23, 16, 29, 18, 18, 46, 33, 30, 22, 21, 30, 35, 25, 13, 17, 28, 24, 32, 26, 10, 35, 23, 28, 19, 13, 24, 30, 20, 19, 26, 36, 28, 32, 28, 29, 32, 13, 32, 30, 36, 25, 20, 32, 18, 30, 27, 32, 23, 33, 30, 31, 20, 35, 29, 24, 35, 21, 18, 16, 23, 29, 26, 14, 13, 29, 24, 30, 13, 28, 24, 19, 28, 16, 25, 28, 30, 32, 14, 10, 12, 39, 16, 20, 20, 12, 30, 17, 28, 28, 37, 22, 16, 21, 32, 20, 25, 19, 37, 26, 13, 22, 18, 18, 16, 30, 16, 34, 11, 16, 14, 32, 35, 16, 17, 29, 17, 24, 28, 15, 21, 24, 18, 20, 16, 24, 20, 22, 30, 24, 27, 26, 42, 22, 35, 24, 19, 10, 34, 19, 18, 30, 19, 13, 17, 27, 28, 36, 23, 23, 25, 19, 12, 30, 15, 38, 15, 36, 19, 28, 27, 12, 31, 18, 26, 15, 34, 29, 32, 19, 14, 34, 24, 11, 23, 29, 15, 23, 16, 28, 11, 37, 23, 31, 14, 32, 19, 27, 29, 10, 19, 12, 19, 17, 34, 24, 26, 17, 15, 28, 18, 14, 24, 22, 13, 10, 23, 22, 19, 31, 22, 12, 23, 27, 16, 13, 22, 34, 12, 30, 20, 30, 29, 25, 35, 19, 11, 27, 16, 22, 10, 35, 26, 14, 32, 16, 24, 15, 19, 18, 37, 24, 30, 16, 22, 38, 25, 32, 28, 21, 26, 23, 10, 19, 20, 23, 15, 25, 23, 14, 20, 33, 29, 14, 11, 13, 28, 17, 24, 19, 12, 35, 30, 25, 15, 25, 24, 37, 17, 25, 11, 24, 18, 19, 27, 29, 15, 29, 12, 10, 22, 14, 18, 15, 15, 15, 23, 15, 23, 27, 21, 39, 13, 23, 13, 26, 20, 22, 20, 26, 37, 21, 20, 17, 34, 30, 18, 30, 34, 27, 10, 29, 10, 18, 18, 12, 19, 27, 20, 26, 27, 13, 23, 24, 10, 14, 22, 17, 19, 15, 24, 15, 16, 21, 24, 27, 17, 14, 29, 22, 26, 16, 17, 15, 28, 20, 32, 18, 25, 24, 15, 10, 25, 35, 30, 10, 34, 20, 36, 25, 15, 35, 22, 22, 12, 23, 20, 12, 27, 16, 30, 28, 12, 19, 15, 25, 12, 26, 21, 13, 19, 23, 31, 12, 11, 16, 12, 11, 33, 30, 30, 20, 23, 30, 19, 32, 21, 32, 36, 18, 13, 24, 26, 21, 20, 19, 34, 42, 20, 19, 31, 12, 24, 21, 30, 13, 14, 24, 18, 17, 13, 22, 37, 24, 21, 17, 13, 18, 50, 16, 25, 19, 20, 14, 15, 18, 44, 29, 21, 33, 18, 25, 35, 29, 17, 27, 41, 22, 22, 32, 34, 33, 20, 31, 30, 27, 22, 14, 23, 19, 27, 17, 26, 12, 27, 24, 36, 19, 21, 23, 40, 29, 27)

# Find x position of maximum difference.
# You could precalculate `mean(mydata)` and `sd(mydata)` if you wish,
# we use it later in plotting too.
maxdiff <- mydata[which.max(
  abs(ecdf(mydata)(mydata) - pnorm(mydata, mean(mydata), sd = sd(mydata)))
)]

df <- data.frame(x = mydata)


ggplot(df, aes(x)) +
  stat_ecdf(aes(colour = "Empirical")) +
  stat_function(fun = pnorm, args = list(mean = mean(mydata), sd = sd(mydata)),
                aes(colour = "Theoretical")) +
  annotate(
    "segment",
    x = maxdiff, xend = maxdiff,
    y = ecdf(mydata)(maxdiff), 
    yend = pnorm(maxdiff, mean(mydata), sd(mydata)),
    size = 1
  )

reprex 包于 2021-04-25 创建(v1.0.0)


推荐阅读