繁体   English   中英

删除<0个值并删除行

[英]Remove < 0 values & delete rows

编辑,在下面添加了代码

首先,对不起,我现在无法确定一个可复制的好例子,但是我想我的问题可以不回答。

我的数据涉及一些从手动操作的测试机上获得的折线图。 因为它是手动操作,所以我们获得了可变的开始时间,因此数据之间没有正确地“重叠”。

以前通过使用以下代码解决了此问题:

#import data
x <- read.csv("smoke.csv", head=T, sep=",")

#flag '0' values, remove all zero values
row_sub = apply(x, 1, function(row) all(row > 0))
y <- x[row_sub,]

由于样本量小且时间紧,因此以前可以使用。 随着更多样本的出现,我现在在图表中得到了一些“剪裁”:

picture

我不是专家,请解释一下:“ row_sub”是“ x”的修改版本,仅保留所有值均大于0的行

此处的问题在所附的图像中说明了此问题。 我们可以看到第一个样本还可以,因为它可能花费了最长的时间插入到设备中。 但是操作员在整个测试过程中都变得更好,减少了样品进料时间,从而导致样品4中出现极端的削峰现象。

我知道我可以轻松地手动完成此操作,只需删除每个样本的前导零值,然后剪辑所有数据的尾端以确保它们都具有相等的数据点即可。 但是我不知道如何在R中做到这一点。

编辑以下是数据: http : //pastebin.com/iEW4sH2a

# Check & load required packages 
if (require("grid") == FALSE) install.packages("grid")
if (require("ggplot2") == FALSE) install.packages("ggplot2")
if (require("gridExtra") == FALSE) install.packages("gridExtra")
if (require("flux") == FALSE) install.packages("flux")
if (require("matrixStats") == FALSE) install.packages("matrixStats")
if (require("mgcv") == FALSE) install.packages("mgcv")


# Set working directory, read datafile
setwd("C location here")
x <- read.csv("smoke.csv", head=T, sep=",")
# Remove 'time' column

# flag '0' values, remove zero values
row_sub = apply(x, 1, function(row) all(row > 0, na.rm=TRUE))
y <- x[row_sub,]
rownames(y) <- NULL

# create time axis with appropriate length & attach to df
time <- seq(0,120, by=0.2)
time <- time[0:nrow(y)]
z <- cbind(time, y)
z <- na.omit(z)

#graph parameters
y_max <- 5.0

a.means <- rowMeans(z[,2:5])
b.means <- rowMeans(z[,6:9])
c.means <- rowMeans(z[,10:13])
d.means <- rowMeans(z[,14:17])

all.data <- cbind(z, a.means, b.means, c.means, d.means)

# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
    require(grid)

    # Make a list from the ... arguments and plotlist
    plots <- c(list(...), plotlist)

    numPlots = length(plots)

    # If layout is NULL, then use 'cols' to determine layout
    if (is.null(layout)) {
        # Make the panel
        # ncol: Number of columns of plots
        # nrow: Number of rows needed, calculated from # of cols
        layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                         ncol = cols, nrow = ceiling(numPlots/cols))
    }

    if (numPlots==1) {
        print(plots[[1]])

    } else {
        # Set up the page
        grid.newpage()
        pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

        # Make each plot, in the correct location
        for (i in 1:numPlots) {
            # Get the i,j matrix positions of the regions that contain this subplot
            matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

            print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                            layout.pos.col = matchidx$col))
        }
    }
}

#calculate area under curve
a.auc <- round(auc(z$time, a.means),2)
b.auc <- round(auc(z$time, b.means),2)
c.auc <- round(auc(z$time, c.means),2)
d.auc <- round(auc(z$time, d.means),2)
# Prepare plots

a_graph <- ggplot(data=all.data, aes(time)) + 
    geom_point(aes(y=a1), alpha=0.1, color="indianred") + 
    geom_point(aes(y=a2), alpha=0.1, color="indianred1") + 
    geom_point(aes(y=a3), alpha=0.1, color="indianred2") + 
    geom_point(aes(y=a4), alpha=0.1, color="indianred3") +
    geom_line(aes(y=a.means), size=1, color="indianred4") +
    ggtitle("145A: Standard") +
    geom_text(aes(75, 1.5, label = a.auc)) +
    scale_x_continuous("Time(s)", limits=c(0,120)) +
    scale_y_continuous("Smoke(%Opacity)", limits=c(0,y_max))

b_graph <- ggplot(data=all.data, aes(time)) + 
    geom_point(aes(y=b1), alpha=0.1, color="chartreuse") + 
    geom_point(aes(y=b2), alpha=0.1, color="chartreuse1") + 
    geom_point(aes(y=b3), alpha=0.1, color="chartreuse2") + 
    geom_point(aes(y=b4), alpha=0.1, color="chartreuse3") +
    geom_line(aes(y=b.means), size=1, color="chartreuse4") +
    ggtitle("145B: +0.5%") +
    geom_text(aes(75, 1.5, label = b.auc)) +
    scale_x_continuous("Time(s)", limits=c(0,120)) +
    scale_y_continuous("Smoke(%Opacity)", limits=c(0,y_max))

c_graph <- ggplot(data=all.data, aes(time)) + 
    geom_point(aes(y=c1), alpha=0.1, color="turquoise") + 
    geom_point(aes(y=c2), alpha=0.1, color="turquoise1") + 
    geom_point(aes(y=c3), alpha=0.1, color="turquoise2") + 
    geom_point(aes(y=c4), alpha=0.1, color="turquoise3") +
    geom_line(aes(y=c.means), size=1, color="turquoise4") +
    ggtitle("145C: +1.0%") +
    geom_text(aes(75, 1.5, label = c.auc)) +
    scale_x_continuous("Time(s)", limits=c(0,120)) +
    scale_y_continuous("Smoke(%Opacity)", limits=c(0,y_max))

d_graph <- ggplot(data=all.data, aes(time)) + 
    geom_point(aes(y=d1), alpha=0.1, color="indianred") + 
    geom_point(aes(y=d2), alpha=0.1, color="indianred1") + 
    geom_point(aes(y=d3), alpha=0.1, color="indianred2") + 
    geom_point(aes(y=d4), alpha=0.1, color="indianred3") +
    geom_line(aes(y=d.means), size=1, color="indianred4") +
    ggtitle("145A: Standard") +
    geom_text(aes(75, 1.5, label = d.auc)) +
    scale_x_continuous("Time(s)", limits=c(0,120)) +
    scale_y_continuous("Smoke(%Opacity)", limits=c(0,y_max))

sample_names <- as.data.frame(c("145A", "145B", "145C", "145D"))
sample_auc <- as.data.frame(c(a.auc, b.auc, c.auc, d.auc))
sample_all <- as.data.frame(cbind(sample_names,sample_auc))
colnames(sample_all) <- c("x","y")

multiplot(a_graph, b_graph, c_graph, d_graph, cols=2)

我仍然不是100%肯定我理解这个问题,但我认为我对它的理解更好。

根据我的理解,除了时间以外,您的数据列会通过改变数量而开始移动,这些数量您一开始并不关心。

如果是这种情况,您可以做的是定义一个阈值小阈值thresh在此之后您要将数据视为每列的开始并丢弃之前的所有内容。

## Untested ##
x <- lapply(x, as.numeric)
thresh <- 0.01
## store all indices until thresh is exceeded
ind2Rm <- lapply(x, function(col) 1:which(col > thresh)[1])
for(j in 2:length(x)) { # don't loop over time which is 1st column
  x[[j]] <- x[[j]][-ind2Rm[[j]]] # remove these first values that don't exceed thresh
}

此后,您将需要组合数据以将其重新绘制到数据框中。 由于列表元素的长度可能不同,因此您可以通过在每列的末尾填充NA来将它们组合成一个数据帧。 有关方法的一种方法,请参见此SO问题答案

也许这就是您想要的?

dt <- list(ax = x[c(1,grep("a", colnames(x)))], bx = x[c(1,grep("b", colnames(x)))], cx = x[c(1,grep("c", colnames(x)))], dx = x[c(1,grep("d", colnames(x)))])

z <- lapply(dt, function(k) {
    out <- k[apply(k[-1], 1, function(row) all(row > 0, na.rm=TRUE)),]
    out$time <- seq(from = 0, by = 0.2, length = nrow(out))
    out
    })

Reduce(function(x, y) merge(x, y, by="time", all = TRUE), z)

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM