首页 > 技术文章 > 利用tcga数据做生存分析

qiniqnyang 2017-01-12 10:30 原文

####teach code####
library(survival)
#read expression data and modify its class and colnames/rownames

#PAAD_gene_expression.csv数据已经经过Z_转换的数据。

z_rna <- read.csv(file="PAAD_gene_expression.csv",header = T,row.names = 1)

#clincal是所有样本的临床信息
clinical <- t(read.table("PAAD.merged_only_clinical_clin_format.txt",header=T, row.names=1, sep="\t",stringsAsFactors = F))

clinical <- as.data.frame(clinical)
#transform rna data's colnames and rownames
z_rna <- as.matrix(z_rna)
colnames(z_rna) <- gsub("\\.","-",colnames(z_rna))
rownames(z_rna) <- sapply(rownames(z_rna), function(x) unlist(strsplit(x,"\\|"))[[1]])


#check the type of cancer data
#table(substr(colnames(z_rna),14,15))
#grep patient ID in clinical data
clinical$IDs <- toupper(clinical$patient.bcr_patient_barcode)
sum(clinical$IDs %in% colnames(z_rna))# we have 178 patients that we could use
#grep information for survival

ind_keep <- grep("days_to_death",colnames(clinical))
death <- as.matrix(clinical[,ind_keep])
death_collapsed <- apply(death,1,FUN = function(x)max(x,na.rm = T)) #提取最大值
death_collapsed<- as.numeric(death_collapsed)

#last follow_up information
ind_keep <- grep("days_to_last_followup",colnames(clinical))
fl <- as.matrix(clinical[,ind_keep])
fl_collapsed <- apply(fl,1,FUN = function(x)max(x,na.rm = T))
fl_collapsed <- as.numeric(fl_collapsed)


#construct a new dataframe containing all information
all_clin <- data.frame(death_collapsed,fl_collapsed)
colnames(all_clin) <- c("death_days", "followUp_days")
all_clin$death_event <- ifelse(clinical$patient.vital_status == "alive", 0,1)
all_clin$new_death <- apply(all_clin,1,function(x)ifelse(x[3]==0,x[2],x[1]))
table(clinical$patient.vital_status)
rownames(all_clin) <- clinical$IDs
##connect to RNA-seq
# create event vector for RNASeq data
event_rna <- t(apply(z_rna, 1, function(x) ifelse(abs(x) > 1.96,1,0)))
in_clin_tum <- match(colnames(z_rna),rownames(all_clin))
tumor_clinical <- all_clin[in_clin_tum,]
#example gene
ind_gene <- which(rownames(z_rna) == "TP53")
table(event_rna[ind_gene,])


###survival analysis###


#over_all survival
su_OS <- Surv(as.numeric(tumor_clinical$new_death),as.numeric(tumor_clinical$death_event))
fit_OS <- survfit(su_OS~event_rna[ind_gene,])
p <- survdiff(su_OS~event_rna[ind_gene,])
pv <- ifelse(is.na(p),next,(round(1 - pchisq(p$chisq, length(p$n) - 1),3)))[[1]]
x1 <- ifelse(is.na(as.numeric(summary(fit_OS)$table[,'median'][1])),"NA",as.numeric(summary(fit_OS)$table[,'median'][1]))
x2 <- as.numeric(summary(fit_OS)$table[,'median'][2])

#get necessary information
summary(fit_OS)
summary(fit_OS)$table[,'median']
summary(fit_OS,times = 365)
#plot survival curve

plot(fit_OS,conf.int=FALSE,col=c("black","red"),xlab="Days",ylab = "Proportion Survival")
legend(1800,0.995,legend=paste("p.value = ",pv[[1]],sep=""),bty="n",cex=1.4)
legend(max(as.numeric(as.character(all_clin$death_days)[in_clin_tum]),na.rm = T)*0.7,0.94,
legend=c(paste("NotAltered=",x1),paste("Altered=",x2)),bty="n",cex=1.3,lwd=3,col=c("black","red"))

推荐阅读