r - 在 R 中跨多个 CSV 优化变量和年份合并代码
问题描述
经济学研究助理在这里。我正在使用 R 合并 College Scorecard 和 IPEDS 数据集以输出到单个 CSV 以在 Stata 中运行 OLS。我对大学记分卡合并代码非常满意,但是虽然 IPEDS 合并代码有效,但它非常糟糕,我很乐意帮助优化它。下面是其中一个变量的一年的合并代码片段。如果查看数据集的结构会有所帮助,请告诉我,我会看看如何发布数据集。
# 1.2.1.1 - 2008
ipeds2008ControlTuitionSTFRAdmissions = read.csv(file = file.path(ipedsDir, "2008ControlTuitionSTFRAdmissions.csv"), stringsAsFactors = FALSE)
ipeds2008Gender = read.csv(file = file.path(ipedsDir, "2008Gender.csv"), stringsAsFactors = FALSE)
ipeds2008Grad = read.csv(file = file.path(ipedsDir, "2008Grad.csv"), stringsAsFactors = FALSE)
ipeds2008Race = read.csv(file = file.path(ipedsDir, "2008Race.csv"), stringsAsFactors = FALSE)
.
.
.
# 1.2.2.1 - 2008
ipeds2008GenderGrad = merge(ipeds2008Gender,
ipeds2008Grad)
ipeds2008GenderGradRace = merge(ipeds2008GenderGrad,
ipeds2008Race)
ipeds2008 = merge(ipeds2008GenderGradRace,
ipeds2008ControlTuitionSTFRAdmissions)
# Garbage management.
rm(ipeds2008Gender,
ipeds2008ControlTuitionSTFRAdmissions,
ipeds2008GenderGrad,
ipeds2008GenderGradRace,
ipeds2008Race,
ipeds2008Grad)
.
.
.
# Now, let's merge all the IPEDS years.
ipedsMerged = Reduce(function(x, y) merge(x, y, all=TRUE), list(ipeds2008,
ipeds2009,
ipeds2010,
ipeds2011,
ipeds2012,
ipeds2013,
ipeds2014,
ipeds2015,
ipeds2016,
ipeds2017))
.
.
.
# Now, let's merge all the IPEDS years.
ipedsMerged = Reduce(function(x, y) merge(x, y, all=TRUE), list(ipeds2008,
ipeds2009,
ipeds2010,
ipeds2011,
ipeds2012,
ipeds2013,
ipeds2014,
ipeds2015,
ipeds2016,
ipeds2017))
# Because we currently have each variable year as a separate variable (with nulls in observation years that
# aren't in that year), we want to merge all the variable years into one variable, if that makes sense. Dplyr's
# coalesce function is very good at that. Let's start with the public vs private variable.
ipedsMerged$institutionalControl = coalesce(ipedsMerged$IC2008.Institutional.control.or.affiliation,
ipedsMerged$IC2009.Institutional.control.or.affiliation,
ipedsMerged$IC2010.Institutional.control.or.affiliation,
ipedsMerged$IC2011.Institutional.control.or.affiliation,
ipedsMerged$IC2012.Institutional.control.or.affiliation,
ipedsMerged$IC2013.Institutional.control.or.affiliation,
ipedsMerged$IC2014.Institutional.control.or.affiliation,
ipedsMerged$IC2015.Institutional.control.or.affiliation,
ipedsMerged$IC2016.Institutional.control.or.affiliation,
ipedsMerged$IC2017.Institutional.control.or.affiliation)
# Dropping all the rows that we don't need now.
ipedsMerged = ipedsMerged[ , !(names(ipedsMerged) %in% c("IC2008.Institutional.control.or.affiliation",
"IC2009.Institutional.control.or.affiliation",
"IC2010.Institutional.control.or.affiliation",
"IC2011.Institutional.control.or.affiliation",
"IC2012.Institutional.control.or.affiliation",
"IC2013.Institutional.control.or.affiliation",
"IC2014.Institutional.control.or.affiliation",
"IC2015.Institutional.control.or.affiliation",
"IC2016.Institutional.control.or.affiliation",
"IC2017.Institutional.control.or.affiliation",
"IDX_EF",
"IDX_GR"))]
.
.
.
ipedsTbl = as_tibble(ipedsMerged)
# Let's do some more garbage management.
rm(ipeds2008,
ipeds2009,
ipeds2010,
ipeds2011,
ipeds2012,
ipeds2013,
ipeds2014,
ipeds2015,
ipeds2016,
ipeds2017,
ipedsMerged)
# 1.3 - For the piece de resistance, let's merge ipeds and college scorecard.
ipedsCS = left_join(ipedsTbl, csTbl, by = c("unitid" = "unitid", "year" = "year"))
write_csv(ipedsCS, path = file.path(thisDir, "Data", "Merged", "ipedsCS.csv"))
解决方案
推荐阅读
- c - 为什么不执行错误之前的解析器操作?
- reactjs - 在 react-native 中,如何正确排序取决于数据的数据获取和 useState
- flutter - 什么相当于返回(弹出)GetX 中的页面?
- r - 为什么我的电脑内存快到极限时会减少?
- selenium - 使用 Selenium C# DevTools 捕获 F12 选项网络下的所有标头
- python - 带有输入的无限循环并引发值错误-python
- c# - C#如何抄送自己
- scala - 从超类列表中提取给定子类的元素
- python - 如何从超链接显示为“Javascript”的网站下载文件
- linux - 有没有办法使用 bash 脚本根据发布日期值对 *.md 文件进行排序?