首页 > 解决方案 > 唯一对(列)之间的共享变量在 R 中使用 widyr() 给出错误

问题描述

我正在尝试计算区域对之间共享了多少物种。因此,如果物种 A、B 和 C 都出现在美国和加拿大,但物种 D 只出现在加拿大,那么美国-加拿大对的共有物种数 == 3。我使用了 widyr( ) 包在 R 中对大型数据集(50 个国家/地区的 >10,000 个物种)执行此操作,但我遇到两个错误:

  1. 如果我分析整个数据集,则会丢失一些区域对,而如果我只是对这两个区域进行子集化,那么它就可以工作(给出该对的共享物种的数量)。2)我只尝试了区域的一个子集,与使用完整数据集相比,我得到了共享多少物种的不同答案。

我看过一些关于计算对之间距离和唯一对之间距离的类似帖子,但这并不是我想要做的。我以前从未使用过widyr,我无法弄清楚错误是什么 - 我想知道它是否与“第一个”和“最后一个”有关?

# sample.dat.sub dput below - it's long but I wasn't sure I coudl reproduce the problem with only a few rows

# sample.dat.sub has cols "species_id" and "rgn_id"

## get col names
spp_ids <- sample.dat.sub %>% distinct(species_id) %>% as.character()
rgn_ids <- sample.dat.sub %>% distinct(rgn_id) 
rgn_ids <- rgn_ids$rgn_id
rgn_ids <- as.character(rgn_ids)

## find pairs
pairs.df <- sample.dat.sub %>%
  group_by(species_id) %>%
  arrange(rgn_id) %>%
  # maybe this is the problem?
  mutate(rgn1 = first(rgn_id), rgn2 = last(rgn_id))  

result.df <- pairs.df %>%
  group_by(rgn1, rgn2) %>%
  # number of unique species that each region pair shares
  summarize(n_spp = n_distinct(species_id))

# THE PROBLEM:
result.df %>% filter(rgn1 == 134) # says 134-136 have 27 shared species, but if I just analyzed those 2 reigons alone, it says they share 75 species...

## test the sample

sample.test <- sample.dat.sub %>% filter(rgn_id %in% c(134,136))

# get col names
test.spp_ids <- sample.test %>% distinct(species_id) %>% as.character()
test.rgn_ids <- sample.test %>% distinct(rgn_id) 
test.rgn_ids <- test.rgn_ids$rgn_id
test.rgn_ids <- as.character(test.rgn_ids)

# pairs
test.pairs.df <- sample.test %>%
  group_by(species_id) %>%
  arrange(rgn_id) %>%
  mutate(rgn1 = first(rgn_id), rgn2 = last(rgn_id)) 
test.pairs.df 

test.result.df <- test.pairs.df %>%
  group_by(rgn1, rgn2) %>%
  summarize(n_spp = n_distinct(species_id))

test.result.df %>% arrange(desc(n_spp)) # now it says 75 species shared

## SAMPLE DATA

sample.dat.sub <- structure(list(species_id = c("Brc-378165", "Brc-378165", "Brc-378165", 
"Chn-0721ac0c-7d19-46fa-9dcc-02da54310867", "Chn-0721ac0c-7d19-46fa-9dcc-02da54310867", 
"Chn-0721ac0c-7d19-46fa-9dcc-02da54310867", "Chn-46f58c2c-1280-4f6a-a148-e854c19246fe", 
"Chn-46f58c2c-1280-4f6a-a148-e854c19246fe", "Chn-4a9f789f-84e4-4da2-b520-c754b804a303", 
"Chn-4a9f789f-84e4-4da2-b520-c754b804a303", "Chn-4a9f789f-84e4-4da2-b520-c754b804a303", 
"Chn-5b98a280-b9a0-435b-8ed9-87b3f63fc29b", "Chn-5b98a280-b9a0-435b-8ed9-87b3f63fc29b", 
"Chn-5b98a280-b9a0-435b-8ed9-87b3f63fc29b", "Chn-eff8756a-9d9f-4430-886b-4a0badfed55d", 
"Chn-eff8756a-9d9f-4430-886b-4a0badfed55d", "Chn-eff8756a-9d9f-4430-886b-4a0badfed55d", 
"Chn-f031838d-0a64-4288-8d25-04c0ab44ad67", "Chn-f41af29f-5e40-4cb4-8862-220e6e64b329", 
"Chn-f41af29f-5e40-4cb4-8862-220e6e64b329", "Chn-f41af29f-5e40-4cb4-8862-220e6e64b329", 
"Fis-10903", "Fis-10993", "Fis-10993", "Fis-10993", "Fis-112580", 
"Fis-112580", "Fis-112580", "Fis-112580", "Fis-112582", "Fis-112582", 
"Fis-112582", "Fis-112590", "Fis-112590", "Fis-112590", "Fis-112688", 
"Fis-112688", "Fis-112688", "Fis-112689", "Fis-112689", "Fis-112689", 
"Fis-112689", "Fis-112706", "Fis-112706", "Fis-112706", "Fis-112706", 
"Fis-112822", "Fis-112822", "Fis-112822", "Fis-113904", "Fis-113904", 
"Fis-114088", "Fis-114088", "Fis-114088", "Fis-114098", "Fis-114098", 
"Fis-114098", "Fis-114124", "Fis-114124", "Fis-114124", "Fis-114187", 
"Fis-11429", "Fis-11429", "Fis-11429", "Fis-114378", "Fis-114378", 
"Fis-114378", "Fis-114615", "Fis-114615", "Fis-114615", "Fis-114618", 
"Fis-114618", "Fis-114618", "Fis-114618", "Fis-114623", "Fis-114623", 
"Fis-114623", "Fis-114937", "Fis-114937", "Fis-114937", "Fis-114940", 
"Fis-114940", "Fis-114940", "Fis-115015", "Fis-115015", "Fis-115015", 
"Fis-115342", "Fis-115352", "Fis-115352", "Fis-115352", "Fis-115623", 
"Fis-115623", "Fis-115623", "Fis-115798", "Fis-115798", "Fis-115798", 
"Fis-115798", "Fis-115821", "Fis-115821", "Fis-115821", "Fis-115864", 
"Fis-115864", "Fis-115865", "Fis-115865", "Fis-115865", "Fis-115865", 
"Fis-115865", "Fis-115873", "Fis-115873", "Fis-115873", "Fis-115873", 
"Fis-115957", "Fis-115957", "Fis-115957", "Fis-115963", "Fis-115963", 
"Fis-115963", "Fis-115986", "Fis-115986", "Fis-115986", "Fis-115989", 
"Fis-115989", "Fis-115989", "Fis-116375", "Fis-116375", "Fis-116472", 
"Fis-116479", "Fis-116479", "Fis-116479", "Fis-116737", "Fis-116737", 
"Fis-116737", "Fis-116742", "Fis-116742", "Fis-116742", "Fis-116747", 
"Fis-116747", "Fis-116747", "Fis-116901", "Fis-116901", "Fis-116901", 
"Fis-116910", "Fis-116910", "Fis-116910", "Fis-116926", "Fis-116926", 
"Fis-117119", "Fis-117121", "Fis-117126", "Fis-117126", "Fis-117126", 
"Fis-117130", "Fis-117130", "Fis-117130", "Fis-117220", "Fis-117220", 
"Fis-117220", "Fis-117350", "Fis-117350", "Fis-117350", "Fis-117350", 
"Fis-117358", "Fis-117358", "Fis-117370", "Fis-117370", "Fis-117370", 
"Fis-117375", "Fis-117375", "Fis-117375", "Fis-117773", "Fis-117773", 
"Fis-117773", "Fis-117773", "Fis-118052", "Fis-118052", "Fis-118052", 
"Fis-118058", "Fis-118058", "Fis-118058", "Fis-118058", "Fis-118106", 
"Fis-118106", "Fis-118106", "Fis-118742", "Fis-118742", "Fis-120072", 
"Fis-120072", "Fis-120072", "Fis-120428", "Fis-120428", "Fis-120470", 
"Fis-120470", "Fis-120470", "Fis-120470", "Fis-120470", "Fis-120470", 
"Fis-120669", "Fis-120908", "Fis-120908", "Fis-120908", "Fis-122362", 
"Fis-122362", "Fis-122362", "Fis-123096", "Fis-123121", "Fis-123121", 
"Fis-123121", "Fis-123123", "Fis-123123", "Fis-123123", "Fis-123128", 
"Fis-123128", "Fis-123128", "Fis-123959", "Fis-123959", "Fis-123959", 
"Fis-123975", "Fis-124168", "Fis-124168", "Fis-124204", "Fis-124204", 
"Fis-124208", "Fis-124211", "Fis-124211", "Fis-124211", "Fis-124275", 
"Fis-124275", "Fis-124324", "Fis-124324", "Fis-124376", "Fis-124376", 
"Fis-124376", "Fis-124376", "Fis-124376", "Fis-124378", "Fis-124378", 
"Fis-125137", "Fis-125137", "Fis-125137", "Fis-125137", "Fis-125186", 
"Fis-125235", "Fis-125622", "Fis-125622", "Fis-125622", "Fis-125639", 
"Fis-125639", "Fis-125642", "Fis-125642", "Fis-125886", "Fis-125888", 
"Fis-125893", "Fis-125893", "Fis-125893", "Fis-126085", "Fis-126085", 
"Fis-126140", "Fis-126296", "Fis-126299", "Fis-126299", "Fis-126299", 
"Fis-126681", "Fis-126681", "Fis-126681", "Fis-126951", "Fis-127016", 
"Fis-127016", "Fis-127016", "Fis-127124", "Fis-127433", "Fis-127433", 
"Fis-127444", "Fis-127530", "Fis-127585", "Fis-127585", "Fis-128046", 
"Fis-128046", "Fis-128046", "Fis-128063", "Fis-128093", "Fis-128093", 
"Fis-128093", "Fis-128094", "Fis-128094", "Fis-128110", "Fis-128110", 
"Fis-128236", "Fis-128236", "Fis-128376", "Fis-128376", "Fis-128376", 
"Fis-128376", "Fis-128422", "Fis-128608", "Fis-128608", "Fis-128612", 
"Fis-128612", "Fis-130029", "Fis-130029", "Fis-130029", "Fis-130702", 
"Fis-130702", "Fis-130702", "Fis-130702", "Fis-130709", "Fis-130709", 
"Fis-130709", "Fis-130709", "Fis-130771", "Fis-130806", "Fis-130806", 
"Fis-130806", "Fis-130825", "Fis-130825", "Fis-130825", "Fis-130876", 
"Fis-130876", "Fis-130876", "Fis-130917", "Fis-130917", "Fis-130917", 
"Fis-130921", "Fis-130921", "Fis-131008", "Fis-131178", "Fis-131178", 
"Fis-131312", "Fis-131312", "Fis-131316", "Fis-131316", "Fis-131323", 
"Fis-131323", "Fis-131323", "Fis-131546", "Fis-131546", "Fis-131546", 
"Fis-131546", "Fis-131590", "Fis-131590", "Fis-131590", "Fis-131936", 
"Fis-132003", "Fis-132003", "Fis-132003", "Fis-132010", "Fis-132010", 
"Fis-132010", "Fis-132018", "Fis-132018", "Fis-132018", "Fis-132022", 
"Fis-132022", "Fis-132022", "Fis-132037", "Fis-132037", "Fis-132037", 
"Fis-132059", "Fis-132059", "Fis-132059", "Fis-132062", "Fis-132062", 
"Fis-132062", "Fis-132062", "Fis-132064", "Fis-132064", "Fis-132064", 
"Fis-132076", "Fis-132076", "Fis-132076", "Fis-132087", "Fis-132087", 
"Fis-132142", "Fis-132142", "Fis-132153", "Fis-132153", "Fis-132153", 
"Fis-132157", "Fis-132157", "Fis-132159", "Fis-132159", "Fis-132159", 
"Fis-132167", "Fis-132167", "Fis-132167", "Fis-132169", "Fis-132169", 
"Fis-132169", "Fis-132243", "Fis-132243", "Fis-132243", "Fis-132269", 
"Fis-132269", "Fis-132269", "Fis-132395", "Fis-132433", "Fis-132433", 
"Fis-132433", "Fis-132442", "Fis-132442", "Fis-132442", "Fis-132477", 
"Fis-132477", "Fis-132477", "Fis-132479", "Fis-132479", "Fis-132479", 
"Fis-132487", "Fis-132487", "Fis-132487", "Fis-132488", "Fis-132488", 
"Fis-132488", "Fis-132490", "Fis-132490", "Fis-132490", "Fis-132496", 
"Fis-132597", "Fis-132597", "Fis-132597", "Fis-132597", "Fis-132605", 
"Fis-132605", "Fis-132605", "Fis-132609", "Fis-132609", "Fis-132609", 
"Fis-132643", "Fis-132643", "Fis-132754", "Fis-132754", "Fis-132754", 
"Fis-132762", "Fis-132762", "Fis-132762", "Fis-132763", "Fis-132763", 
"Fis-132763", "Fis-132766", "Fis-132770", "Fis-132808", "Fis-132808", 
"Fis-132808", "Fis-132813", "Fis-132813", "Fis-132813", "Fis-132813", 
"Fis-132814", "Fis-132814", "Fis-132814", "Fis-132966", "Fis-133336", 
"Fis-133336", "Fis-133336", "Fis-133336", "Fis-133346", "Fis-133346", 
"Fis-133350", "Fis-133350", "Fis-133350", "Fis-133356", "Fis-133356", 
"Fis-133356", "Fis-133356", "Fis-133480", "Fis-133508", "Fis-133510", 
"Fis-133512", "Fis-133521", "Fis-133521", "Fis-133521", "Fis-133526", 
"Fis-133537", "Fis-133549", "Fis-133567", "Fis-133567", "Fis-133567", 
"Fis-133571", "Fis-133571", "Fis-133571", "Fis-133572", "Fis-133572", 
"Fis-133572", "Fis-133755", "Fis-133755", "Fis-133755", "Fis-133755", 
"Fis-134025", "Fis-134025", "Fis-134025", "Fis-134029", "Fis-134029", 
"Fis-134029", "Fis-134033", "Fis-134033", "Fis-134057", "Fis-134092", 
"Fis-134103", "Fis-134108", "Fis-134125", "Fis-134130"), rgn_id = c("110", 
"112", "136", "138", "136", "134", "136", "134", "134", "138", 
"136", "134", "136", "138", "134", "138", "136", "138", "138", 
"134", "136", "138", "138", "134", "136", "250", "136", "110", 
"112", "250", "110", "112", "250", "112", "110", "112", "250", 
"110", "112", "110", "136", "250", "250", "136", "112", "110", 
"250", "110", "112", "136", "134", "136", "138", "134", "138", 
"136", "134", "138", "136", "134", "138", "110", "250", "112", 
"112", "110", "250", "112", "110", "250", "110", "250", "136", 
"112", "250", "112", "110", "112", "110", "250", "112", "110", 
"250", "110", "250", "112", "110", "136", "112", "110", "250", 
"110", "112", "112", "250", "110", "136", "112", "110", "250", 
"112", "110", "134", "250", "138", "136", "112", "250", "112", 
"136", "110", "112", "250", "110", "110", "250", "112", "112", 
"250", "110", "112", "250", "110", "136", "134", "138", "250", 
"110", "112", "250", "112", "110", "138", "134", "136", "138", 
"134", "136", "136", "134", "138", "134", "136", "138", "138", 
"136", "138", "138", "250", "112", "110", "134", "138", "136", 
"250", "112", "110", "112", "110", "250", "136", "136", "134", 
"250", "110", "112", "112", "110", "250", "250", "110", "136", 
"112", "112", "250", "110", "250", "136", "110", "112", "250", 
"136", "112", "136", "134", "136", "134", "138", "136", "134", 
"110", "112", "134", "250", "136", "138", "138", "136", "134", 
"138", "110", "250", "112", "138", "134", "138", "136", "134", 
"138", "136", "134", "136", "138", "138", "134", "136", "138", 
"134", "136", "134", "136", "138", "134", "138", "136", "134", 
"136", "136", "134", "134", "112", "136", "110", "250", "110", 
"138", "112", "110", "136", "134", "138", "138", "136", "134", 
"138", "136", "134", "136", "134", "138", "138", "112", "250", 
"110", "134", "136", "138", "112", "250", "112", "110", "250", 
"112", "110", "134", "134", "138", "136", "138", "134", "136", 
"138", "138", "136", "134", "250", "112", "110", "138", "138", 
"134", "136", "134", "136", "136", "134", "134", "136", "110", 
"112", "250", "136", "138", "134", "136", "136", "134", "110", 
"112", "250", "110", "250", "136", "112", "250", "112", "110", 
"136", "138", "250", "112", "110", "112", "110", "250", "136", 
"138", "134", "112", "110", "250", "136", "134", "112", "134", 
"136", "134", "136", "134", "136", "136", "138", "134", "250", 
"112", "138", "110", "112", "250", "110", "134", "112", "110", 
"250", "136", "138", "134", "136", "134", "138", "250", "110", 
"112", "134", "138", "136", "110", "112", "250", "136", "250", 
"110", "112", "136", "134", "138", "134", "138", "136", "110", 
"112", "110", "112", "112", "250", "110", "110", "112", "110", 
"250", "112", "138", "134", "136", "250", "110", "112", "112", 
"110", "250", "110", "250", "112", "138", "136", "138", "134", 
"134", "136", "138", "138", "136", "134", "134", "136", "138", 
"134", "136", "138", "136", "138", "134", "138", "136", "134", 
"136", "110", "136", "112", "250", "250", "110", "112", "110", 
"112", "250", "134", "136", "112", "250", "110", "250", "110", 
"112", "134", "136", "138", "138", "138", "134", "138", "136", 
"110", "250", "112", "136", "250", "110", "112", "138", "110", 
"138", "250", "112", "136", "134", "250", "110", "112", "136", 
"110", "112", "250", "138", "138", "138", "138", "138", "136", 
"134", "138", "138", "138", "110", "250", "112", "112", "250", 
"110", "112", "250", "110", "250", "112", "110", "136", "136", 
"138", "134", "136", "138", "134", "136", "134", "138", "138", 
"138", "138", "138", "134")), row.names = c(NA, -500L), class = c("data.table", 
"data.frame"), .internal.selfref = <pointer: 0x00000281a3641ef0>)

标签: r

解决方案


以下代码应该可以回答您的问题。使用 self-join in 很容易解决tidyverse,不需要额外的包widyr(我不知道)

library(dplyr)  #NB: use library(tidyverse) if you want full functionality

df_distinct <- sample.dat.sub %>% distinct(species_id, rgn_id)

inner_join(df_distinct, df_distinct , by="species_id", suffix=c("_1","_2")) %>% 
  filter(rgn_id_1 < rgn_id_2) %>% 
  group_by(rgn_id_1, rgn_id_2) %>% 
  summarise(nb_common_species = n())

推荐阅读