简体   繁体   中英

Running a loop on multiple .txt files in one folder in R

I have a script that calculates copy number variation and saves the results into an existing file named "genesforcomp1", matching rows on the first column. The input files are named BRCA1.txt, BRCA2.txt, BRCA3.txt, ..., BRCA4376.txt. The other input file, "genes.txt", is the same in every cycle and is used for annotation, while "genesforcomp1" is updated with the output of each cycle. Because of the large number of files, I would like to know whether I can do this with a loop in R. Here is my script:

# Annotate the CNV segments of one sample (BRCA1.txt) with gene symbols,
# rescale the segment values to [-1, 1] and merge them into the running
# results table "genesforcomp1", matching rows on Ensembl_ID.
setwd("/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/aa")
library(GenomicRanges)
library(dplyr)
library(scales)
# library() fails immediately when the package is missing; require() would
# only warn and return FALSE, postponing the error to the first tidyverse call.
library(tidyverse)

# Create the annotation (reference) table ----
genes <- read.table(
  "/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/genes.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)
# Replace X / Y in the chromosome names with 23 / 24.
genes$chromosome_name <- gsub("X", "23", genes$chromosome_name)
genes$chromosome_name <- gsub("Y", "24", genes$chromosome_name)
colnames(genes) <- c("GeneSymbol", "Chr", "Start", "End")
genes_GR <- makeGRangesFromDataFrame(genes, keep.extra.columns = TRUE)

# File to be analysed (3 steps: pre-processing, comparison with the
# reference/annotation table, post-processing) ----
df <- read.table("BRCA1.txt", sep = "\t", stringsAsFactors = FALSE, header = TRUE)
df$Chromosome <- gsub("X", "23", df$Chromosome)
df$Chromosome <- gsub("Y", "24", df$Chromosome)
colnames(df) <- c("Barcode", "Chr", "Start", "End", "extra1", "extra2")
cnv <- makeGRangesFromDataFrame(df, keep.extra.columns = TRUE)

# Pair every gene (query) with the segment(s) (subject) it lies within.
hits <- findOverlaps(genes_GR, cnv, type = "within")
df_ann <- cbind(df[subjectHits(hits), ], genes[queryHits(hits), ])
df_ann <- unique(df_ann)

# NOTE(review): the original script also derived a Seg_value2 column,
# sign(x) * (2^abs(x) - 1) * 2, and then dropped it without ever using it.
# That dead computation is removed here; confirm whether Seg_value2 was meant
# to be the value that gets rescaled and saved.
df_ann <- df_ann[, c("GeneSymbol", "extra2")]
colnames(df_ann) <- c("Ensembl_ID", "Seg_value")
df_ann$Seg_value <- rescale(df_ann$Seg_value, to = c(-1, 1))

# Append the rows of genesbase.txt and keep the first row per Ensembl_ID, so
# rows coming from the sample take precedence over the appended ones.
df_ann1 <- read.table(
  "/home/sumit/Academic/DHR/TCGA/Gene List/Final1/genesbase.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)
df <- rbind.data.frame(df_ann, df_ann1)
df <- df[!duplicated(df$Ensembl_ID), ]

# Save the results into the existing file, matched on the first column ----
df1 <- read.delim("genesforcomp1", check.names = FALSE, stringsAsFactors = FALSE)
# A direct full_join() replaces reduce() over a two-element list: the range
# packages attached with GenomicRanges and purrr both define reduce(), so the
# bare name silently depends on the order in which packages were attached.
df2 <- full_join(data.frame(df1), data.frame(df), by = "Ensembl_ID")
# IDs missing from either side get 0 instead of NA.
df2[is.na(df2)] <- 0
write.table(df2, file = "genesforcomp1", quote = FALSE, sep = "\t", row.names = FALSE)

Any suggestions or ideas of how to loop the script will be appreciated. Thanks in advance!

Since your file names follow a regular pattern, you can loop from 1 to 4376 and replace "BRCA1.txt" in your code with paste0("BRCA", i, ".txt"). There are probably ways to loop without hard-coding the pattern, but in your case you don't seem to need them.

# For every sample file BRCA1.txt ... BRCA4376.txt: annotate its CNV segments
# with gene symbols, rescale the segment values to [-1, 1] and merge them into
# the running results table "genesforcomp1", matching rows on Ensembl_ID.
setwd("/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/aa")
library(GenomicRanges)
library(dplyr)
library(scales)
# library() fails immediately when the package is missing; require() would
# only warn and return FALSE, postponing the error to the first tidyverse call.
library(tidyverse)

# Create the annotation (reference) table ----
genes <- read.table(
  "/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/genes.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)
# Replace X / Y in the chromosome names with 23 / 24.
genes$chromosome_name <- gsub("X", "23", genes$chromosome_name)
genes$chromosome_name <- gsub("Y", "24", genes$chromosome_name)
colnames(genes) <- c("GeneSymbol", "Chr", "Start", "End")
genes_GR <- makeGRangesFromDataFrame(genes, keep.extra.columns = TRUE)

# genesbase.txt is identical for every sample, so it is read once here instead
# of once per loop iteration.
df_ann1 <- read.table(
  "/home/sumit/Academic/DHR/TCGA/Gene List/Final1/genesbase.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)

# Files to be analysed (3 steps: pre-processing, comparison with the
# reference/annotation table, post-processing) ----
for (i in seq_len(4376)) {
  df <- read.table(
    paste0("BRCA", i, ".txt"),
    sep = "\t", stringsAsFactors = FALSE, header = TRUE
  )
  df$Chromosome <- gsub("X", "23", df$Chromosome)
  df$Chromosome <- gsub("Y", "24", df$Chromosome)
  colnames(df) <- c("Barcode", "Chr", "Start", "End", "extra1", "extra2")
  cnv <- makeGRangesFromDataFrame(df, keep.extra.columns = TRUE)

  # Pair every gene (query) with the segment(s) (subject) it lies within.
  hits <- findOverlaps(genes_GR, cnv, type = "within")
  df_ann <- cbind(df[subjectHits(hits), ], genes[queryHits(hits), ])
  df_ann <- unique(df_ann)

  # NOTE(review): the original script also derived a Seg_value2 column,
  # sign(x) * (2^abs(x) - 1) * 2, and then dropped it without ever using it.
  # That dead computation is removed here; confirm whether Seg_value2 was
  # meant to be the value that gets rescaled and saved.
  df_ann <- df_ann[, c("GeneSymbol", "extra2")]
  colnames(df_ann) <- c("Ensembl_ID", "Seg_value")
  df_ann$Seg_value <- rescale(df_ann$Seg_value, to = c(-1, 1))

  # Append the rows of genesbase.txt and keep the first row per Ensembl_ID, so
  # rows coming from the sample take precedence over the appended ones.
  df <- rbind.data.frame(df_ann, df_ann1)
  df <- df[!duplicated(df$Ensembl_ID), ]

  # Save the results into the existing file, matched on the first column.
  # The file is re-read and rewritten on every pass, so each iteration builds
  # on the output of the previous one.
  df1 <- read.delim("genesforcomp1", check.names = FALSE, stringsAsFactors = FALSE)
  # A direct full_join() replaces reduce() over a two-element list: the range
  # packages attached with GenomicRanges and purrr both define reduce(), so
  # the bare name silently depends on the order in which packages were attached.
  # NOTE(review): every sample contributes a column named Seg_value, so the
  # join has to disambiguate repeated names with suffixes; consider naming the
  # column after the sample file - confirm the desired output layout.
  df2 <- full_join(data.frame(df1), data.frame(df), by = "Ensembl_ID")
  # IDs missing from either side get 0 instead of NA.
  df2[is.na(df2)] <- 0
  write.table(df2, file = "genesforcomp1", quote = FALSE, sep = "\t", row.names = FALSE)
}

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM