首页 > 解决方案 > R 问题:如何模拟此图以显示健康和疾病中细胞的百分比变化?

问题描述

我有兴趣展示细胞的比例如何从健康变为疾病。我想展示从健康到疾病的“流程”,而不仅仅是两个单独的堆叠条形图,但我不确定这种类型的可视化是否有名称,而且我无法在网上找到很多示例。我想在 R 中执行此操作。它几乎是 sankey 图和和弦图的混合体。

我希望你们中的一些人对我可以使用哪些包在 R 中实现这一点有一些想法。

我用付费软件做的例子

标签: rvisualization

解决方案


正如 Ian Campbell 在评论中指出的那样,这被称为冲积地块,你可能会非常接近这个ggalluvial包。但是,可以使用 justgeom_ribbongeom_textfrom获得几乎相同的情节再现ggplot2

在此处输入图像描述

但是,这有点棘手。首先,我们需要一种方法来生成从一侧到另一侧的漂亮平滑曲线。以下函数采用开始和结束级别(作为 0 和 1 之间的数字)。它还允许可选地增加或减少任一侧列的宽度:

ribbon_line <- function(p1, p2, width = 10, len = 100)
{
  if (width > 50) width <- 50
  if (width < 0) width <- 0
  if (p1 < 0) p1 <- 0
  if (p1 > 1) p1 <- 1
  if (p2 < 0) p2 <- 0
  if (p2 > 1) p2 <- 1

  yvals <- c(p1, p1, pnorm(seq(-2.5, 2.5, length.out = len)) * (p2 - p1) + p1, p2, p2)
  xvals <- c(0, seq(width, 100 - width, length.out = len + 2), 100)
  list(x = xvals, y = yvals)
}

现在我们需要一种将两条线组合成一个数据框的方法,我们可以绘制坐标:

ribbon_df <- function(uppers, lowers, group, width = 10)
{
  data.frame(x    = ribbon_line(uppers[1], uppers[2], width)$x,
             ymax = ribbon_line(uppers[1], uppers[2], width)$y,
             ymin = ribbon_line(lowers[1], lowers[2], width)$y,
             group = group, stringsAsFactors = FALSE)
}

接下来,我们需要一种方法来获取一个简单的输入并将其变成一组这些功能区,加上左右列,加上文本标签:

multi_ribbons <- function(left_bottom, right_bottom, left_top, right_top,
                          groups, width = 10)
{
  if (length(left_bottom) != length(right_bottom) |
      length(left_bottom) != length(left_top)     |
      length(left_top)    != length(right_top)) 
    stop("Left and right columns different length")
  if (length(groups) != length(left_bottom))
    stop("Group length has to be same length as columns")

  d <- lapply(seq_along(groups), function(i) {
    ribbon_df(c(left_top[i], right_top[i]),
              c(left_bottom[i], right_bottom[i]),
              groups[i], width)})

  left_cols <- lapply(d, function(x) x[1:2,])
  right_cols <- lapply(d, function(x) x[nrow(x) - 1:0,])

  res <- list( left = do.call(rbind, left_cols),
               right = do.call(rbind, right_cols),
               bands = do.call(rbind, d))

  text_y <- c((res$left$ymax + res$left$ymin)/2,
             (res$right$ymax + res$right$ymin)/2)
  text_x <- c(rep(width / 2, length(res$left$x)), 
              rep(100 - width/2, length(res$left$x)))
  text_labels <- paste0(round(c(res$left$ymax - res$left$ymin,
                         res$right$ymax - res$right$ymin), 3) * 100, "%")
  res$text <- data.frame(x = text_x, y = text_y, labels = text_labels)
  res
}

最后,我们想要一种将我们的数据作为一对简单的因子向量并使用上述函数绘制它们的方法:

alluvial <- function(yvar, xvar, width = 20)
{
  tab <- table(yvar, xvar)
  x_labs <- rownames(tab)
  y_labs <- colnames(tab)
  left <- tab[1,]/sum(tab[1,])
  left <- cumsum(sort(left))
  right <- tab[2,]/sum(tab[2,])
  right <- cumsum(sort(right))
  left_lower <- c(0, left[-length(left)])
  names(left_lower) <- names(left)
  right_lower <- c(0, right[-length(right)])
  names(right_lower) <- names(right)
  right <- right[match(names(left), names(right))]
  right_lower <- right_lower[match(names(left), names(right_lower))]
  df_list <- multi_ribbons(left_lower, right_lower, left, right, 
                           names(left), width = 20)
  ggplot(df_list$bands, aes(x = x, ymin = ymin, ymax = ymax, fill = group)) + 
    geom_ribbon(alpha = 0.5) +
    geom_ribbon(alpha = 1, data = df_list$left) +
    geom_ribbon(alpha = 1, data = df_list$right) +
    geom_text(data = df_list$text, inherit.aes = FALSE, colour = "white",
              aes(x = x, y = y, label = labels), size = 8) +
    geom_text(data = data.frame(x = c(width / 2, 100 - width /2), y = c(1.05, 1.05),
                         labels = factor(x_labs, levels = x_labs)),
              inherit.aes = FALSE,
              mapping = aes(x = x, y = y, label = labels), size = 12) +
    geom_text(data = data.frame(x = rep(-5, length(y_labs)), 
                                y = unique(df_list$text$y[1:(nrow(df_list$text)/2)]), 
                                labs = unique(df_list$bands$group)),
              mapping = aes(x = x, y = y, colour = labs, label = labs),
              inherit.aes = FALSE, size = 8, hjust = 1) +
    scale_fill_manual(values = c("#e64b35", "#806249", "#00a087", "#3c5488")) +
    scale_colour_manual(values = c("#e64b35", "#806249", "#00a087", "#3c5488")) +
    coord_cartesian(xlim = c(-15, 101)) +
    theme_void() + theme(legend.position = "none")
}

所以,如果我们你的数据框是这样的格式:

head(df, 20)
#>     condition    variable
#> 110   Disease      Immune
#> 149   Disease  Fibroblast
#> 133   Disease      Immune
#> 184   Disease Endothelial
#> 137   Disease      Immune
#> 200   Disease Endothelial
#> 30     Health      Immune
#> 11     Health      Immune
#> 63     Health  Fibroblast
#> 88     Health Endothelial
#> 42     Health  Fibroblast
#> 38     Health  Fibroblast
#> 106   Disease      Immune
#> 139   Disease      Immune
#> 6      Health  Epithelial
#> 21     Health      Immune
#> 27     Health      Immune
#> 181   Disease Endothelial
#> 95     Health Endothelial
#> 108   Disease      Immune

你可以这样做:

alluvial(df$condition, df$variable)

为了得到上面的情节,或者,为了更随机的东西:

set.seed(69)
alluvial(sample(c(TRUE, FALSE), 200, replace = TRUE), 
         sample(LETTERS[1:4], 200, replace = TRUE))

在此处输入图像描述

如果您想要四个以上的颜色或填充级别,您可以删除或调整scale_colour_manualandscale_fill_manual调用,例如:

set.seed(69)
alluvial(sample(c(TRUE, FALSE), 200, replace = TRUE), 
         sample(LETTERS[1:20], 200, replace = TRUE))

在此处输入图像描述


推荐阅读