Workflow definition

First we define packages:

then ‘swissmunicipalities’ example data (with only 2 domains):

data(swissmunicipalities)
swissmun <- swissmunicipalities[swissmunicipalities$REG < 3,
                                c("REG","COM","Nom","HApoly",
                                  "Surfacesbois","Surfacescult",
                                  "Airbat","POPTOT")]
head(swissmun)
#>    REG  COM       Nom HApoly Surfacesbois Surfacescult Airbat POPTOT
#> 2    1 6621    Geneve   1593           67           31    773 177964
#> 4    2  351      Bern   5162         1726         1041   1070 128634
#> 5    1 5586  Lausanne   4136         1635          714    856 124914
#> 9    2  371 Biel (BE)   2123          976          196    463  48655
#> 10   2  942      Thun   2158          425          694    523  40377
#> 11   2  355     Koniz   5099         1567         2621    515  37782

Finally, the plan:

plan <- drake_plan(
  # Definition of the sampling frame
  frame = buildFrameDF(
    df = swissmun,
    id = "COM",
    domainvalue= "REG",
    X = c("HApoly","POPTOT"),
    Y =c("Surfacesbois", "Airbat")),
  # Definition of precision constraints
  cv = as.data.frame(list(
    DOM = rep("DOM1",length(unique(swissmun$REG))),
    CV1 = rep(0.10,length(unique(swissmun$REG))),
    CV2 = rep(0.10,length(unique(swissmun$REG))),
    domainvalue= c(1:length(unique(swissmun$REG))))),
  # Solution with kmeans
  kmean = KmeansSolution2(frame=frame,
                          errors=cv,
                          maxclusters=5),
  # Determination of number of strata for each domain
  nstrat = tapply(kmean$suggestions,
                  kmean$domainvalue,
                  FUN=function(x) length(unique(x))),
  # Initial solutions
  sugg = prepareSuggestion(kmean=kmean,
                           frame=frame,
                           nstrat=nstrat),
  # Optimisation
  solution = optimStrata(method="continuous",
                        framesamp=frame,
                        errors= cv,
                        suggestions = sugg,
                        nStrat=nstrat,
                        iter = 25,
                        pops= 10),
  # Optimal strata
  strata_structure = summaryStrata(solution$framenew,
                                   solution$aggr_strata,
                                   progress=FALSE),
  # Expected CVs with optimal strata
  expected_CV1 = expected_CV(solution$aggr_strata),
  # Adjust allocation with affordable sample size
  sample_size = 150,
  adjustedStrata = adjustSize(size=sample_size,strata=solution$aggr_strata,cens=NULL),
  # New expected CVs
  expected_CV2 = expected_CV(adjustedStrata),
  # Evaluation by simulation
  eval = evalSolution(frame = solution$framenew, 
                     outstrata = adjustedStrata, 
                     nsampl = 500,
                     progress = FALSE), 
  # Final selection of the sample
  sample = selectSample(solution$framenew,adjustedStrata)
)

Execution of the workflow

We have just defined this plan:

print(plan)
#> # A tibble: 13 × 2
#>    target           command                                                                                                                                                                                                                        
#>    <chr>            <expr_lst>                                                                                                                                                                                                                     
#>  1 frame            buildFrameDF(df = swissmun, id = "COM", domainvalue = "REG",      X = c("HApoly", "POPTOT"), Y = c("Surfacesbois", "Airbat"))                                                                                                  
#>  2 cv               as.data.frame(list(DOM = rep("DOM1", length(unique(swissmun$REG))),      CV1 = rep(0.1, length(unique(swissmun$REG))), CV2 = rep(0.1,          length(unique(swissmun$REG))), domainvalue = c(1:length(unique(swissmun$REG)))))
#>  3 kmean            KmeansSolution2(frame = frame, errors = cv, maxclusters = 5)                                                                                                                                                                   
#>  4 nstrat           tapply(kmean$suggestions, kmean$domainvalue, FUN = function(x) length(unique(x)))                                                                                                                                              
#>  5 sugg             prepareSuggestion(kmean = kmean, frame = frame, nstrat = nstrat)                                                                                                                                                               
#>  6 solution         optimStrata(method = "continuous", framesamp = frame, errors = cv,      suggestions = sugg, nStrat = nstrat, iter = 25, pops = 10)                                                                                             
#>  7 strata_structure summaryStrata(solution$framenew, solution$aggr_strata, progress = FALSE)                                                                                                                                                       
#>  8 expected_CV1     expected_CV(solution$aggr_strata)                                                                                                                                                                                              
#>  9 sample_size      150                                                                                                                                                                                                                            
#> 10 adjustedStrata   adjustSize(size = sample_size, strata = solution$aggr_strata,      cens = NULL)                                                                                                                                                
#> 11 expected_CV2     expected_CV(adjustedStrata)                                                                                                                                                                                                    
#> 12 eval             evalSolution(frame = solution$framenew, outstrata = adjustedStrata,      nsampl = 500, progress = FALSE)                                                                                                                       
#> 13 sample           selectSample(solution$framenew, adjustedStrata)

visualised in this way:

vis_drake_graph(plan, ncol_legend = NULL,
                main="Optimization with 'continuous' method")

and now we execute:

make(plan)
#>  All targets are already up to date.

Output inspection

First of all, the total size of the optimized sample:

options(width = 999)
loadd(solution)
sum(solution$aggr_strata$SOLUZ)
#> [1] 87.42433

Then, the structure of the optimized strata:

loadd(strata_structure)
strata_structure
#>   Domain Stratum Population Allocation SamplingRate Lower_X1 Upper_X1 Lower_X2 Upper_X2
#> 1      1       1        346         10     0.030101       32      935       27     2283
#> 2      1       2        125         13     0.102664      185     3031       65     3247
#> 3      1       3         51          5     0.099717      129     2984     3312    29559
#> 4      1       4         64         19     0.291158     3144    24269       78    22454
#> 5      1       5          3          2     0.666667     1593    28225     6536   177964
#> 6      2       1        468          7     0.015805       34      823       30     1322
#> 7      2       2        298         11     0.037152      168     1782       79     4916
#> 8      2       3         93          7     0.069973      152     4182      128     9609
#> 9      2       4         54         13     0.249651      421    20079      272   128634

The expected CVs given the optimized strata:

loadd(expected_CV1)
expected_CV1
#>      cv(Y1) cv(Y2)
#> DOM1  0.099  0.099
#> DOM2  0.100  0.100

and after the adjustment of the allocation in the strata based on the affordable sample size:

loadd(expected_CV2)
expected_CV2
#>      cv(Y1) cv(Y2)
#> DOM1  0.070  0.071
#> DOM2  0.074  0.072

The results of simulation with the adjusted allocation:

loadd(eval)
eval$coeff_var
#>      CV1    CV2  dom
#> 1 0.0670 0.0743 DOM1
#> 2 0.0731 0.0733 DOM2
eval$rel_bias
#>        y1      y2  dom
#> 1 -0.0041  0.0026 DOM1
#> 2 -0.0002 -0.0014 DOM2

Finally, the selection of the sample from the adjusted strata:

loadd(sample)
head(sample)
#>   DOMAINVALUE STRATO   ID  X1  X2  Y1 Y2 LABEL  WEIGHTS        FPC
#> 1           1      1 5500 237 193  24  9     1 19.22222 0.05202312
#> 2           1      1 5533 500 543 184 17     1 19.22222 0.05202312
#> 3           1      1 6178 301  27 124  3     1 19.22222 0.05202312
#> 4           1      1 5935  86  50  13  2     1 19.22222 0.05202312
#> 5           1      1 5514 679 917 131 27     1 19.22222 0.05202312
#> 6           1      1 5905 688 299 176 12     1 19.22222 0.05202312