# make fake data; notice group 4 has only 8 rows
# fix the RNG seed so the simulated y values are identical on every run
set.seed(1)
grp <- c(rep(0:3, each = 10), rep(4, 8), rep(5:6, each = 10))
# one standard-normal draw per row, rounded to 2 decimals
y <- round(rnorm(length(grp)), 2)
d <- data.frame(y, grp)

Consultation summary 2023-08-28
Task 1: drop any groups with fewer than 10 rows
We want to drop group 4, since it has only 8 rows instead of 10.
# count the rows in each group; the output below shows group 4 with only 8
table(d$grp)
 0  1  2  3  4  5  6
10 10 10 10  8 10 10
Here’s how I did it.
# minimum number of rows a group must have in order to be kept
min_rows <- 10
# count the rows in each group
tab <- table(d$grp)
# names of the groups with fewer than min_rows rows; "<" is used rather than
# "!= 10" so that a group with MORE than 10 rows is not dropped by mistake
drop <- names(tab)[tab < min_rows]
# keep only the rows whose grp is not one of the undersized groups
# (%in% matches the numeric grp against the character table names)
d2 <- subset(d, !(grp %in% drop))
# group 4 is dropped: tabulating d2 shows only groups 0-3, 5 and 6 remain
table(d2$grp)
 0  1  2  3  5  6
10 10 10 10 10 10
Task 2: drop rows that belong to runs of 5 or more consecutive zeroes
# simulate 1000 draws from 0:4 in which zero is 16 times as likely as each
# of the other values, so long runs of zeroes are common
set.seed(666)
y <- sample(
  0:4,
  size = 1000,
  replace = TRUE,
  prob = c(16, rep(1, 4)) / 20
)
d <- data.frame(y = y)
head(d, n = 20)
  y
1 0
2 0
3 1
4 0
5 0
6 0
7 1
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 2
17 0
18 3
19 0
20 0
Notice we start off with a run of 2 zeroes, then 1 one, then 3 zeroes, then 1 one, then 8 zeroes, …
We can use the rle() function to calculate runs.
# run-length encode y: rle() returns the length of each run of consecutive
# equal values ($lengths) together with the value being repeated ($values)
r_out <- rle(d$y)
r_out
Run Length Encoding
  lengths: int [1:359] 2 1 3 1 8 1 1 1 2 1 ...
  values : int [1:359] 0 1 0 1 0 2 0 3 0 1 ...
lengths gives the length of each run, and values gives the value that is repeated in that run:
Notice we start off with a run of 2 zeroes, then 1 one, then 3 zeroes, then 1 one, then 8 zeroes, …
Again, the task is to drop rows where the run of zeroes is 5 or more. Here’s how I did it. First, add the run lengths to the data frame, repeating each length as many times as the run is long, so that every row carries the length of the run it belongs to.
# tag every row with the length of the run it belongs to: each run length is
# repeated once per row in that run
d$cnt <- with(r_out, rep(lengths, times = lengths))
head(d, n = 20)
  y cnt
1 0 2
2 0 2
3 1 1
4 0 3
5 0 3
6 0 3
7 1 1
8 0 8
9 0 8
10 0 8
11 0 8
12 0 8
13 0 8
14 0 8
15 0 8
16 2 1
17 0 1
18 3 1
19 0 2
20 0 2
Now we can subset the data frame on the condition that cnt < 5.
# keep every row except those in a run of 5 or more zeroes; testing cnt alone
# would also drop long runs of non-zero values, and removing such a run could
# join the zero runs on either side of it into a new long run
d2 <- subset(d, !(y == 0 & cnt >= 5))
# re-encode the filtered data and confirm that no run of zeroes of length 5
# or more is left
r_out2 <- rle(d2$y)
any(r_out2$values == 0 & r_out2$lengths >= 5)
[1] FALSE