Create a resampled tibble

samplify() creates a resampled tibble with virtual groups.

samplify(data, times, size, ..., replace = FALSE, key = ".sample")

Arguments

data	A tbl.
times	A single integer specifying the number of resamples. If the `tibble` is grouped, this is the number of resamples per group.
size	A single integer specifying the size of each resample. For a grouped data frame, this may also be an integer vector whose length equals the number of groups in `data`, giving one resample size per group. This is helpful when sampling without replacement from groups whose row counts differ greatly.
...	Not used.
replace	Whether or not to sample with replacement.
key	A single string specifying the name of the virtual group that is added.

Value

A resampled_df with an extra group specified by the key.

Details

The following functions have special / interesting behavior when used with a resampled_df:

Examples

library(dplyr)
library(broom)

samplify(iris, times = 3, size = 20)
#> # A tibble: 150 x 5
#> # Groups:   .sample [3]
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#>  1          5.1         3.5          1.4         0.2 setosa 
#>  2          4.9         3            1.4         0.2 setosa 
#>  3          4.7         3.2          1.3         0.2 setosa 
#>  4          4.6         3.1          1.5         0.2 setosa 
#>  5          5           3.6          1.4         0.2 setosa 
#>  6          5.4         3.9          1.7         0.4 setosa 
#>  7          4.6         3.4          1.4         0.3 setosa 
#>  8          5           3.4          1.5         0.2 setosa 
#>  9          4.4         2.9          1.4         0.2 setosa 
#> 10          4.9         3.1          1.5         0.1 setosa 
#> # … with 140 more rows

iris %>%
  samplify(times = 3, size = 20) %>%
  summarise(per_strap_mean = mean(Petal.Width))
#> # A tibble: 3 x 2
#>   .sample per_strap_mean
#>     <int>          <dbl>
#> 1       1           1.17
#> 2       2           1.36
#> 3       3           1.26

iris %>%
  group_by(Species) %>%
  samplify(times = 3, size = 20) %>%
  summarise(per_strap_species_mean = mean(Petal.Width))
#> # A tibble: 9 x 3
#> # Groups:   Species [3]
#>   Species    .sample per_strap_species_mean
#>   <fct>        <int>                  <dbl>
#> 1 setosa           1                  0.265
#> 2 setosa           2                  0.215
#> 3 setosa           3                  0.25 
#> 4 versicolor       1                  1.29 
#> 5 versicolor       2                  1.36 
#> 6 versicolor       3                  1.36 
#> 7 virginica        1                  2.10 
#> 8 virginica        2                  2.04 
#> 9 virginica        3                  2.05 

# Alter the name of the group with `key`
# Materialize the resamples into actual rows with collect()
samps <- samplify(iris, times = 3, size = 5, key = ".samps")
collect(samps)
#> # A tibble: 15 x 6
#> # Groups:   .samps [3]
#>    .samps Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
#>     <int>        <dbl>       <dbl>        <dbl>       <dbl> <fct>     
#>  1      1          5.9         3            4.2         1.5 versicolor
#>  2      1          5.7         2.8          4.5         1.3 versicolor
#>  3      1          6.2         2.2          4.5         1.5 versicolor
#>  4      1          4.4         3.2          1.3         0.2 setosa    
#>  5      1          4.8         3            1.4         0.1 setosa    
#>  6      2          5           2            3.5         1   versicolor
#>  7      2          5.8         2.7          5.1         1.9 virginica 
#>  8      2          5.5         2.6          4.4         1.2 versicolor
#>  9      2          4.9         3.6          1.4         0.1 setosa    
#> 10      2          6           3            4.8         1.8 virginica 
#> 11      3          6.5         3            5.5         1.8 virginica 
#> 12      3          5.2         2.7          3.9         1.4 versicolor
#> 13      3          5.5         4.2          1.4         0.2 setosa    
#> 14      3          5.4         3.7          1.5         0.2 setosa    
#> 15      3          5.6         3            4.5         1.5 versicolor

collect(samps, id = ".id", original_id = ".orig_id")
#> # A tibble: 15 x 8
#> # Groups:   .samps [3]
#>    .samps   .id .orig_id Sepal.Length Sepal.Width Petal.Length Petal.Width
#>     <int> <int>    <int>        <dbl>       <dbl>        <dbl>       <dbl>
#>  1      1     1       62          5.9         3            4.2         1.5
#>  2      1     2       56          5.7         2.8          4.5         1.3
#>  3      1     3       69          6.2         2.2          4.5         1.5
#>  4      1     4       43          4.4         3.2          1.3         0.2
#>  5      1     5       13          4.8         3            1.4         0.1
#>  6      2     1       61          5           2            3.5         1  
#>  7      2     2      102          5.8         2.7          5.1         1.9
#>  8      2     3       91          5.5         2.6          4.4         1.2
#>  9      2     4       38          4.9         3.6          1.4         0.1
#> 10      2     5      139          6           3            4.8         1.8
#> 11      3     1      117          6.5         3            5.5         1.8
#> 12      3     2       60          5.2         2.7          3.9         1.4
#> 13      3     3       34          5.5         4.2          1.4         0.2
#> 14      3     4       11          5.4         3.7          1.5         0.2
#> 15      3     5       67          5.6         3            4.5         1.5
#> # … with 1 more variable: Species <fct>

#----------------------------------------------------------------------------

# When sampling without replacement, be careful not to specify a `size`
# larger than your smallest group! This will throw an error.

iris_group_sizes_of_50_and_5 <- iris[1:55,] %>%
  group_by(Species) %>%
  group_trim()

count(iris_group_sizes_of_50_and_5, Species)
#> # A tibble: 2 x 2
#> # Groups:   Species [2]
#>   Species        n
#>   <fct>      <int>
#> 1 setosa        50
#> 2 versicolor     5

# size = 10 > min_group_size = 5
# \dontrun{
iris_group_sizes_of_50_and_5 %>%
  samplify(times = 2, size = 10)
#> Error: `size` (10) must be less than or equal to the size of the data / current group (5), set `replace = TRUE` to use sampling with replacement.
# }

# Instead, pass a vector of sizes to `samplify()` if this
# structure is absolutely required for your use case.

# size of 10 for the first group
# size of 5 for the second group
# total number of rows is 10 * 2 + 5 * 2 = 30
iris_group_sizes_of_50_and_5 %>%
  samplify(times = 2, size = c(10, 5)) %>%
  collect()
#> # A tibble: 30 x 6
#> # Groups:   Species, .sample [4]
#>    .sample Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>      <int>        <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#>  1       1          4.8         3.4          1.9         0.2 setosa 
#>  2       1          5           3            1.6         0.2 setosa 
#>  3       1          4.8         3            1.4         0.3 setosa 
#>  4       1          5.1         3.5          1.4         0.3 setosa 
#>  5       1          4.9         3.1          1.5         0.2 setosa 
#>  6       1          5.7         3.8          1.7         0.3 setosa 
#>  7       1          5           3.2          1.2         0.2 setosa 
#>  8       1          5.7         4.4          1.5         0.4 setosa 
#>  9       1          4.8         3            1.4         0.1 setosa 
#> 10       1          5.3         3.7          1.5         0.2 setosa 
#> # … with 20 more rows

Arguments

Value

Details

See also

Examples

Contents