SamplingStrata workflow with drake

Workflow definition

First we define packages:

library(drake)
library(SamplingStrata)

then ‘swissmunicipalities’ example data (with only 2 domains):

data(swissmunicipalities)
swissmun <- swissmunicipalities[swissmunicipalities$REG < 3,
                                c("REG","COM","Nom","HApoly",
                                  "Surfacesbois","Surfacescult",
                                  "Airbat","POPTOT")]
head(swissmun)
#>    REG  COM       Nom HApoly Surfacesbois Surfacescult Airbat POPTOT
#> 2    1 6621    Geneve   1593           67           31    773 177964
#> 4    2  351      Bern   5162         1726         1041   1070 128634
#> 5    1 5586  Lausanne   4136         1635          714    856 124914
#> 9    2  371 Biel (BE)   2123          976          196    463  48655
#> 10   2  942      Thun   2158          425          694    523  40377
#> 11   2  355     Koniz   5099         1567         2621    515  37782

Finally, the plan:

plan <- drake_plan(
  # Definition of the sampling frame
  frame = buildFrameDF(
    df = swissmun,
    id = "COM",
    domainvalue= "REG",
    X = c("HApoly","POPTOT"),
    Y =c("Surfacesbois", "Airbat")),
  # Definition of precision constraints
  cv = as.data.frame(list(
    DOM = rep("DOM1",length(unique(swissmun$REG))),
    CV1 = rep(0.10,length(unique(swissmun$REG))),
    CV2 = rep(0.10,length(unique(swissmun$REG))),
    domainvalue= c(1:length(unique(swissmun$REG))))),
  # Solution with kmeans
  kmean = KmeansSolution2(frame=frame,
                          errors=cv,
                          maxclusters=5),
  # Determination of number of strata for each domain
  nstrat = tapply(kmean$suggestions,
                  kmean$domainvalue,
                  FUN=function(x) length(unique(x))),
  # Optimisation
  solution = optimStrata(method="continuous",
                        framesamp=frame,
                        errors= cv,
                        nStrat=nstrat,
                        iter = 25,
                        pops= 10),
  # Optimal strata
  strata_structure = summaryStrata(solution$framenew,
                                   solution$aggr_strata,
                                   progress=FALSE),
  # Expected CVs with optimal strata
  expected_CV1 = expected_CV(solution$aggr_strata),
  # Adjust allocation with affordable sample size
  sample_size = 150,
  adjustedStrata = adjustSize(size=sample_size,strata=solution$aggr_strata,cens=NULL),
  # New expected CVs
  expected_CV2 = expected_CV(adjustedStrata),
  # Evaluation by simulation
  eval = evalSolution(frame = solution$framenew, 
                     outstrata = adjustedStrata, 
                     nsampl = 500,
                     progress = FALSE), 
  # Final selection of the sample
  sample = selectSample(solution$framenew,adjustedStrata)
)

Execution of the workflow

We have just defined this plan:

print(plan)
#> # A tibble: 12 × 2
#>    target           command                                                                                                                                                                                                                        
#>    <chr>            <expr_lst>                                                                                                                                                                                                                     
#>  1 frame            buildFrameDF(df = swissmun, id = "COM", domainvalue = "REG",      X = c("HApoly", "POPTOT"), Y = c("Surfacesbois", "Airbat"))                                                                                                  
#>  2 cv               as.data.frame(list(DOM = rep("DOM1", length(unique(swissmun$REG))),      CV1 = rep(0.1, length(unique(swissmun$REG))), CV2 = rep(0.1,          length(unique(swissmun$REG))), domainvalue = c(1:length(unique(swissmun$REG)))))
#>  3 kmean            KmeansSolution2(frame = frame, errors = cv, maxclusters = 5)                                                                                                                                                                   
#>  4 nstrat           tapply(kmean$suggestions, kmean$domainvalue, FUN = function(x) length(unique(x)))                                                                                                                                              
#>  5 solution         optimStrata(method = "continuous", framesamp = frame, errors = cv,      nStrat = nstrat, iter = 25, pops = 10)                                                                                                                 
#>  6 strata_structure summaryStrata(solution$framenew, solution$aggr_strata, progress = FALSE)                                                                                                                                                       
#>  7 expected_CV1     expected_CV(solution$aggr_strata)                                                                                                                                                                                              
#>  8 sample_size      150                                                                                                                                                                                                                            
#>  9 adjustedStrata   adjustSize(size = sample_size, strata = solution$aggr_strata,      cens = NULL)                                                                                                                                                
#> 10 expected_CV2     expected_CV(adjustedStrata)                                                                                                                                                                                                    
#> 11 eval             evalSolution(frame = solution$framenew, outstrata = adjustedStrata,      nsampl = 500, progress = FALSE)                                                                                                                       
#> 12 sample           selectSample(solution$framenew, adjustedStrata)

visualised in this way:

vis_drake_graph(plan, ncol_legend = NULL,
                main="Optimization with 'continuous' method")

and now we execute:

make(plan)
#> ▶ target solution
#> 
#> Input data have been checked and are compliant with requirements
#> 
#>  *** Starting parallel optimization for  2  domains using  2  cores

#> 
#>  *** Sample size :  97
#>  *** Number of strata :  9
#> ---------------------------
#> ▶ target adjustedStrata
#> 
#>  148
#>  149
#>  149
#>  Final adjusted size:  149
#> ▶ target strata_structure
#> ▶ target expected_CV1
#> ▶ target eval
#> ▶ target expected_CV2
#> ▶ target sample

#> 
#> *** Sample has been drawn successfully ***
#>  149  units have been selected from  9  strata
#> 
#> ==> There have been  1  take-all strata 
#> from which have been selected  4 units

Output inspection

First of all, the total size of the optimized sample:

options(width = 999)
loadd(solution)
sum(solution$aggr_strata$SOLUZ)
#> [1] 97.61437

Then, the structure of the optimized strata:

loadd(strata_structure)
strata_structure
#>   Domain Stratum Population Allocation SamplingRate Lower_X1 Upper_X1 Lower_X2 Upper_X2
#> 1      1       1        369         15     0.040455       32     1212       27     2171
#> 2      1       2         89         12     0.137139      129     1604      195    10955
#> 3      1       3        118         24     0.203822      239     9692       78    29559
#> 4      1       4          9          3     0.324100     9922    20994      255     7515
#> 5      1       5          4          3     0.778272     1593    28225     5988   177964
#> 6      2       1        544         10     0.019213       34      972       30     1841
#> 7      2       2        277         13     0.045605      152     2417       84     6798
#> 8      2       3         55          6     0.109353      421     4609      332    16757
#> 9      2       4         37         11     0.305442      930    20079      272   128634

The expected CVs given the optimized strata:

loadd(expected_CV1)
expected_CV1
#>      cv(Y1) cv(Y2)
#> DOM1    0.1    0.1
#> DOM2    0.1    0.1

and after the adjustment of the allocation in the strata based on the affordable sample size:

loadd(expected_CV2)
expected_CV2
#>         cv(Y1)    cv(Y2)
#> DOM1 0.0762015 0.0755376
#> DOM2 0.0799250 0.0780134

The results of simulation with the adjusted allocation:

loadd(eval)
eval$coeff_var
#>      CV1    CV2  dom
#> 1 0.0762 0.0771 DOM1
#> 2 0.0828 0.0797 DOM2
eval$rel_bias
#>        y1     y2  dom
#> 1  0.0019 0.0045 DOM1
#> 2 -0.0097 0.0019 DOM2

Finally, the selection of the sample from the adjusted strata:

loadd(sample)
head(sample)
#>   DOMAINVALUE STRATO   ID   X1  X2  Y1 Y2 LABEL  WEIGHTS        FPC
#> 1           1      1 5494 1108 828 411 24     1 16.04348 0.06233062
#> 2           1      1 5527  368 888  73 36     1 16.04348 0.06233062
#> 3           1      1 6061  502 290 194 13     1 16.04348 0.06233062
#> 4           1      1 5923  360 170 100  9     1 16.04348 0.06233062
#> 5           1      1 5501  385 813  49 28     1 16.04348 0.06233062
#> 6           1      1 5858  269 322  34  9     1 16.04348 0.06233062

Continuous method

Giulio Barcaroli

2025-02-24

Workflow definition

Execution of the workflow

Output inspection