简体   繁体   English

在 R 的一个文件夹中的多个.txt 文件上运行循环

[英]Running a loop on multiple .txt files in one folder in R

I have a script that calculates the copy number variation and saves the data into an existing file named "genesforcomp1" based on first column information.我有一个脚本,可以计算拷贝数变异,并根据第一列信息将数据保存到名为“genesforcomp1”的现有文件中。 The input files are named BRCA1.txt, BRCA2.txt, BRCA3.txt, ..., BRCA4376.txt.输入文件名为 BRCA1.txt、BRCA2.txt、BRCA3.txt、……、BRCA4376.txt。 The other input file, "genes.txt", is the same in each cycle and is used for the annotation, while "genesforcomp1" is used for updating the output.另一个输入文件“genes.txt”在每个循环中都相同,用于注释,而“genesforcomp1”用于更新输出。 Due to the large number of files, I would like to know if I can do it with a loop in R.由于文件较多,想知道能否通过 R 中的循环来实现。 Here is my script:这是我的脚本:

# Annotate the CNV segments of one sample file ("BRCA1.txt") with gene symbols,
# rescale the segment values to [-1, 1] and merge them into the existing
# "genesforcomp1" table, matching rows on the Ensembl_ID column.
setwd("/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/aa")
library(GenomicRanges)
library(dplyr)
library("scales")
# library() stops right away when the package is missing; require() would only
# return FALSE and let the script fail later with a less obvious error.
library(tidyverse)

# Create annotation or reference table ----
genes <- read.table(
  "/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/genes.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)
# Recode the sex chromosomes as 23/24; the CNV table below is recoded the same
# way.
genes$chromosome_name <- gsub("X", "23", genes$chromosome_name)
genes$chromosome_name <- gsub("Y", "24", genes$chromosome_name)
colnames(genes) <- c("GeneSymbol", "Chr", "Start", "End")
genes_GR <- makeGRangesFromDataFrame(genes, keep.extra.columns = TRUE)

# File to be analyzed ----
# Three steps: preprocessing, comparison with the annotation, post-processing.
df <- read.table(
  "BRCA1.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)
df$Chromosome <- gsub("X", "23", df$Chromosome)
df$Chromosome <- gsub("Y", "24", df$Chromosome)
colnames(df) <- c("Barcode", "Chr", "Start", "End", "extra1", "extra2")
cnv <- makeGRangesFromDataFrame(df, keep.extra.columns = TRUE)

# type = "within": keep gene/segment pairs where the gene (query) lies inside
# the CNV segment (subject).
hits <- findOverlaps(genes_GR, cnv, type = "within")
df_ann <- cbind(df[subjectHits(hits), ], genes[queryHits(hits), ])
df_ann <- unique(df_ann)
df_ann <- df_ann[, c("GeneSymbol", "extra2")]
colnames(df_ann) <- c("Ensembl_ID", "Seg_value")

# NOTE(review): the previous version computed
#   Seg_value2 = sign(Seg_value) * 2 * (2^abs(Seg_value) - 1)
# at this point and then dropped that column before using it, so it never
# reached the output. The dead computation was removed; if the transform was
# meant to feed rescale(), apply it to Seg_value here instead.
df_ann$Seg_value <- rescale(df_ann$Seg_value, to = c(-1, 1))

# Append the base gene list. The sample rows come first, so for a duplicated
# Ensembl_ID the sample's value is the one that is kept.
df_ann1 <- read.table(
  "/home/sumit/Academic/DHR/TCGA/Gene List/Final1/genesbase.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)
df <- rbind.data.frame(df_ann, df_ann1)
df <- df[!duplicated(df$Ensembl_ID), ]

# Save the results into the existing file based on first column values ----
df1 <- read.delim(
  "genesforcomp1",
  check.names = FALSE, stringsAsFactors = FALSE
)
# Join the two tables directly:
# - reduce() is exported by both GenomicRanges and purrr, so which one runs
#   depends on the order the packages were attached;
# - wrapping df1 in data.frame() would re-mangle the column names that
#   check.names = FALSE just preserved.
df2 <- dplyr::full_join(df1, df, by = "Ensembl_ID")
# Genes missing from either side are filled with 0.
df2 <- replace(df2, is.na(df2), 0)
write.table(
  df2,
  file = "genesforcomp1",
  quote = FALSE, sep = "\t", row.names = FALSE
)

Any suggestions or ideas of how to loop the script will be appreciated.任何关于如何循环脚本的建议或想法将不胜感激。 Thanks in advance!提前致谢!

As your filenames follow a nice pattern, you can do a loop from 1 to 4376, and substitute the "BRCA1.txt" in your code with paste0("BRCA", i, ".txt").由于您的文件名遵循一个很好的模式,您可以执行从 1 到 4376 的循环,并将代码中的"BRCA1.txt"替换为 paste0("BRCA", i, ".txt")。 There probably are ways to loop without hard-coding the pattern, but in your case you don't seem to need them.可能有一些方法可以在不对模式进行硬编码的情况下进行循环,但在您的情况下,您似乎不需要它们。

# Annotate every CNV segment file BRCA1.txt ... BRCA4376.txt with gene symbols,
# rescale each sample's segment values to [-1, 1] and append them, one column
# per sample, to the existing "genesforcomp1" table (rows matched on
# Ensembl_ID).
setwd("/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/aa")
library(GenomicRanges)
library(dplyr)
library("scales")
# library() stops right away when the package is missing; require() would only
# return FALSE and let the script fail later with a less obvious error.
library(tidyverse)

# Create annotation or reference table ----
genes <- read.table(
  "/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/genes.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)
# Recode the sex chromosomes as 23/24; every CNV table is recoded the same way
# inside the loop.
genes$chromosome_name <- gsub("X", "23", genes$chromosome_name)
genes$chromosome_name <- gsub("Y", "24", genes$chromosome_name)
colnames(genes) <- c("GeneSymbol", "Chr", "Start", "End")
genes_GR <- makeGRangesFromDataFrame(genes, keep.extra.columns = TRUE)

# Base gene list appended to every sample. It is the same in each iteration,
# so it is read once here instead of once per file.
gene_base <- read.table(
  "/home/sumit/Academic/DHR/TCGA/Gene List/Final1/genesbase.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)

# Input files ----
n_samples <- 4376L
sample_ids <- paste0("BRCA", seq_len(n_samples))
sample_files <- paste0(sample_ids, ".txt")
# Fail before any work is done rather than part-way through the run.
missing_files <- sample_files[!file.exists(sample_files)]
if (length(missing_files) > 0) {
  stop(
    "Missing input file(s): ", paste(missing_files, collapse = ", "),
    call. = FALSE
  )
}

# Files to be analyzed ----
# Three steps per file: preprocessing, comparison with the annotation,
# post-processing.
for (i in seq_along(sample_files)) {
  df <- read.table(
    sample_files[i],
    sep = "\t", stringsAsFactors = FALSE, header = TRUE
  )
  df$Chromosome <- gsub("X", "23", df$Chromosome)
  df$Chromosome <- gsub("Y", "24", df$Chromosome)
  colnames(df) <- c("Barcode", "Chr", "Start", "End", "extra1", "extra2")
  cnv <- makeGRangesFromDataFrame(df, keep.extra.columns = TRUE)

  # type = "within": keep gene/segment pairs where the gene (query) lies
  # inside the CNV segment (subject).
  hits <- findOverlaps(genes_GR, cnv, type = "within")
  df_ann <- cbind(df[subjectHits(hits), ], genes[queryHits(hits), ])
  df_ann <- unique(df_ann)
  df_ann <- df_ann[, c("GeneSymbol", "extra2")]
  colnames(df_ann) <- c("Ensembl_ID", "Seg_value")

  # NOTE(review): the previous version computed
  #   Seg_value2 = sign(Seg_value) * 2 * (2^abs(Seg_value) - 1)
  # at this point and then dropped that column before using it, so it never
  # reached the output. The dead computation was removed; if the transform
  # was meant to feed rescale(), apply it to Seg_value here instead.
  df_ann$Seg_value <- rescale(df_ann$Seg_value, to = c(-1, 1))

  # Append the base gene list. The sample rows come first, so for a
  # duplicated Ensembl_ID the sample's value is the one that is kept.
  sample_values <- rbind.data.frame(df_ann, gene_base)
  sample_values <- sample_values[!duplicated(sample_values$Ensembl_ID), ]

  # Name the value column after the sample. Joining a column called
  # "Seg_value" on every iteration makes dplyr disambiguate the clashing
  # names with ".x"/".y" suffixes, leaving columns that cannot be traced
  # back to the file they came from.
  value_col <- names(sample_values) == "Seg_value"
  names(sample_values)[value_col] <- sample_ids[i]

  # Save the results into the existing file based on first column values.
  # The file is rewritten after every sample, so an interrupted run keeps
  # the samples merged so far.
  merged <- read.delim(
    "genesforcomp1",
    check.names = FALSE, stringsAsFactors = FALSE
  )
  # Join the two tables directly:
  # - reduce() is exported by both GenomicRanges and purrr, so which one runs
  #   depends on the order the packages were attached;
  # - wrapping the table in data.frame() would re-mangle the column names
  #   that check.names = FALSE just preserved.
  merged <- dplyr::full_join(merged, sample_values, by = "Ensembl_ID")
  # Genes missing from either side are filled with 0.
  merged <- replace(merged, is.na(merged), 0)
  write.table(
    merged,
    file = "genesforcomp1",
    quote = FALSE, sep = "\t", row.names = FALSE
  )
}

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM