# Make fake data: groups 0-3 and 5-6 have 10 rows each; group 4 has only 8.
grp <- c(rep(0:3, each = 10), rep(4, 8), rep(5:6, each = 10))
# One random response per row, rounded to 2 decimals (the values are arbitrary).
y <- round(rnorm(length(grp)), 2)
# Data frame with one row per observation: response `y` and group label `grp`.
d <- data.frame(y, grp)
Consultation summary 2023-08-28
Task 1: drop any groups with fewer than 10 rows
We want to drop group 4 since it has only 8 rows instead of 10.
# Tabulate the number of rows in each group.
table(d[["grp"]])
0 1 2 3 4 5 6
10 10 10 10 8 10 10
Here’s how I did it.
# Count the rows in each group.
tab <- table(d$grp)
# Save the names of the groups that have fewer than 10 rows.
# `<` rather than `!=`, so a group with more than 10 rows is never dropped.
drop <- names(tab)[tab < 10]
# Keep only the rows whose group is not in `drop`.
# (`drop` is character; %in% coerces `grp` to character for the comparison.)
d2 <- subset(d, !(grp %in% drop))
# Group 4 is dropped.
table(d2$grp)
0 1 2 3 5 6
10 10 10 10 10 10
Task 2: drop rows that belong to runs of zeroes of length 5 or more
# Make fake data with lots of zeroes: 0 is drawn with probability 16/20
# and each of 1, 2, 3, 4 with probability 1/20.
set.seed(666)  # fix the seed so the draws are reproducible
y <- sample(0:4, size = 1000, replace = TRUE, prob = c(16, 1, 1, 1, 1) / 20)
d <- data.frame(y)
# Look at the first 20 rows.
head(d, n = 20)
y
1 0
2 0
3 1
4 0
5 0
6 0
7 1
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 2
17 0
18 3
19 0
20 0
Notice we start off with a run of 2 zeroes, then 1 one, then 3 zeroes, then 1 one, then 8 zeroes, …
We can use the `rle()` function to calculate runs.
# Run-length encode y: rle() collapses consecutive equal values into runs and
# returns `values` (the value of each run) and `lengths` (how long each run is).
r_out <- rle(d$y)
# Print the encoding.
r_out
Run Length Encoding
lengths: int [1:359] 2 1 3 1 8 1 1 1 2 1 ...
values : int [1:359] 0 1 0 1 0 2 0 3 0 1 ...
`lengths` measures the runs of the `values`:
Notice we start off with a run of 2 zeroes, then 1 one, then 3 zeroes, then 1 one, then 8 zeroes, …
Again, the task is to drop rows where the run of zeroes is 5 or more. Here’s how I did it. Add the run lengths to the data frame, repeating each length once for every row in its run, so that each row is labelled with the length of the run it belongs to.
# Repeat each run length once per row of its run, so `cnt` has one entry per
# row of `d`: the length of the run that row belongs to.
d$cnt <- rep(r_out$lengths, r_out$lengths)
# Look at the first 20 rows with their run lengths.
head(d, n = 20)
y cnt
1 0 2
2 0 2
3 1 1
4 0 3
5 0 3
6 0 3
7 1 1
8 0 8
9 0 8
10 0 8
11 0 8
12 0 8
13 0 8
14 0 8
15 0 8
16 2 1
17 0 1
18 3 1
19 0 2
20 0 2
Now we can subset the data frame on the condition that `cnt < 5`.
# Keep a row if its run is shorter than 5 (cnt < 5) or if it is not a zero.
# The `y != 0` part restricts the drop to runs of zeroes, as the task states;
# `cnt < 5` alone would also drop a long run of any other value.
d2 <- subset(d, y != 0 | cnt < 5)
# Check the result: recompute the runs and confirm that no run of zeroes
# of length 5 or more remains.
r_out2 <- rle(d2$y)
any(r_out2$values == 0 & r_out2$lengths >= 5)
[1] FALSE