Format of the crosstabs’ data

In this vignette we’ll show the underlying data format of the crosstabs generated by crosstabser. Basically, it’s a long format of tidy data such as the one needed if you’d want to plot the crosstab with ggplot2 or observable plot (like in table_charter; see here for an interactive demo). However, in order to reduce redundancy and to save space we store the data in multiple data.frames that can be merged together in the end. First, let’s load the needed libraries:

library(crosstabser)
library(dplyr)
library(ggplot2)
library(purrr)
library(tidyr)
library(haven)

and define a labelled data.frame:

df <- tibble::tibble(
  q1 = c(1, 2, 1) |> haven::labelled(c(Yes = 1, No = 2), label = "Super important question"),
  age = c(2, 1, 1) |> haven::labelled(c("18-39" = 1, "40+" = 2), label = "age")
)
df
#> # A tibble: 3 × 2
#>   q1        age      
#>   <dbl+lbl> <dbl+lbl>
#> 1 1 [Yes]   2 [40+]  
#> 2 2 [No]    1 [18-39]
#> 3 1 [Yes]   1 [18-39]

Next we define the syntax to generate a crosstab:

Questions <- tibble::tibble(
  Type  = "cat",
  RowVar = "q1",
  Title = "The crosstab's title"
)

We’ll use the age variable for the x-axis of the crosstab:

ColVar <- "age"

Now we can construct an R6 object of class “Tabula”:

mapping_file <- list(Questions = Questions, Macro = list(ColVar = ColVar))
m <- Tabula$new(
  df,
  mapping_file,
)

The Tabula$get_crosstabs_data() method returns a list of dataframes containing the crosstabs’ underlying data:

l <- m$get_crosstabs_data()
l
#> $tab_table
#> # A tibble: 1 × 15
#>      BookNo QuestNo TabName QuestLine TabNo TabType TabTitle TabTitle1 TabTitle2
#>       <dbl> <chr>   <chr>       <dbl> <int> <chr>   <chr>    <chr>     <chr>    
#> 1 999999999 _row_2  CAT#_r…         2     1 CAT     The cro… The cros… The cros…
#> # ℹ 6 more variables: TabTitle3 <chr>, TabCaption <chr>, SelVal <chr>,
#> #   repov_name <chr>, TabCount <int>, TabRowTypes <int>
#> 
#> $val_table
#> # A tibble: 21 × 6
#>       BookNo QuestNo TabNo RowNo ColNo Value
#>        <dbl> <chr>   <int> <int> <int> <dbl>
#>  1 999999999 _row_2      1     4     4   3  
#>  2 999999999 _row_2      1     4     5   2  
#>  3 999999999 _row_2      1     4     6   1  
#>  4 999999999 _row_2      1     5     4   2  
#>  5 999999999 _row_2      1     5     5   1  
#>  6 999999999 _row_2      1     5     6   1  
#>  7 999999999 _row_2      1     6     4  66.7
#>  8 999999999 _row_2      1     6     5  50  
#>  9 999999999 _row_2      1     6     6 100  
#> 10 999999999 _row_2      1     7     4   1  
#> # ℹ 11 more rows
#> 
#> $row_table
#> # A tibble: 11 × 18
#>      BookNo RowNo RowContent RowAbsPercent RowWeighted TabNo RowTitle1 RowTitle2
#>       <dbl> <int> <chr>      <chr>         <chr>       <int> <chr>     <chr>    
#>  1   1.00e9     1 Title      ""            ""              1 "The cro… ""       
#>  2   1.00e9     2 Header     ""            ""              1  <NA>     ""       
#>  3   1.00e9     3 Header     ""            ""              1  <NA>     ""       
#>  4   1.00e9     4 Total      "Abs"         "Unweighte…     1 "TOTAL"   "TOTAL"  
#>  5   1.00e9     5 Detail     "Abs"         "Unweighte…     1 "Yes"     "Yes"    
#>  6   1.00e9     6 Detail     "Percent"     "Unweighte…     1 "Yes"     "Yes"    
#>  7   1.00e9     7 Detail     "Abs"         "Unweighte…     1 "No"      "No"     
#>  8   1.00e9     8 Detail     "Percent"     "Unweighte…     1 "No"      "No"     
#>  9   1.00e9     9 Valid      "Abs"         "Unweighte…     1 "VALID C… "VALID C…
#> 10   1.00e9    10 Valid      "Percent"     "Unweighte…     1 "VALID C… "VALID C…
#> 11   1.00e9    11 Empty      ""            ""              1 ""        ""       
#> # ℹ 10 more variables: RowTitle3 <chr>, RowFormat <chr>, RowDecimals <int>,
#> #   RowVariable <chr>, RowValue <dbl>, row_type <chr>, QuestNo <chr>,
#> #   RowTypeS <chr>, RowType <int>, RowContentDetail <chr>
#> 
#> $head_table
#> # A tibble: 5 × 5
#>      BookNo HeadNo HeadName     HeadTitle HeadCount
#>       <dbl>  <int> <chr>        <chr>         <int>
#> 1 999999999      1 DC#ROWHEADER <NA>              3
#> 2 999999999      2 DC#TOTAL     TOTAL             1
#> 3 999999999      3 age@1        age               2
#> 4 999999999      4 DC#EMPTY     <NA>              1
#> 5 999999999      5 DC#TITLE     <NA>              1
#> 
#> $col_table_all
#>   BookNo ColNo HeadNo ColTitle1 ColTitle2  ColVariable ColValue
#> 1  1e+09     1      1                     DC#ROWHEADER       NA
#> 2  1e+09     2      1                     DC#ROWHEADER       NA
#> 3  1e+09     3      1                     DC#ROWHEADER       NA
#> 4  1e+09     4      2     TOTAL               DC#TOTAL        1
#> 5  1e+09     5      3       age     18-39          age        1
#> 6  1e+09     6      3       age       40+          age        2
#> 7  1e+09     7      4                         DC#EMPTY       NA
#> 8  1e+09     8      5                         DC#TITLE       NA

It contains 5 data.frames:

Now we’re ready to merge all this data into one data.frame:

res <- l[c(
  "tab_table", 
  "val_table", 
  "row_table", 
  "col_table_all",
  "head_table"
)] |> 
  reduce(left_join)
#> Joining with `by = join_by(BookNo, QuestNo, TabNo)`
#> Joining with `by = join_by(BookNo, QuestNo, TabNo, RowNo)`
#> Joining with `by = join_by(BookNo, ColNo)`
#> Joining with `by = join_by(BookNo, HeadNo)`
Click here to see the full data.frame
knitr::kable(res)
BookNo QuestNo TabName QuestLine TabNo TabType TabTitle TabTitle1 TabTitle2 TabTitle3 TabCaption SelVal repov_name TabCount TabRowTypes RowNo ColNo Value RowContent RowAbsPercent RowWeighted RowTitle1 RowTitle2 RowTitle3 RowFormat RowDecimals RowVariable RowValue row_type RowTypeS RowType RowContentDetail HeadNo ColTitle1 ColTitle2 ColVariable ColValue HeadName HeadTitle HeadCount
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 4 4 3.00000 Total Abs Unweighted TOTAL TOTAL abs NA 0 q1 1 total Total|AbsUnweighted 2097408 2 TOTAL DC#TOTAL 1 DC#TOTAL TOTAL 1
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 4 5 2.00000 Total Abs Unweighted TOTAL TOTAL abs NA 0 q1 1 total Total|AbsUnweighted 2097408 3 age 18-39 age 1 age 2
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 4 6 1.00000 Total Abs Unweighted TOTAL TOTAL abs NA 0 q1 1 total Total|AbsUnweighted 2097408 3 age 40+ age 2 age 2
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 5 4 2.00000 Detail Abs Unweighted Yes Yes abs NA 0 q1 1 detail_freqs_valid Detail|AbsUnweighted 2097168 2 TOTAL DC#TOTAL 1 DC#TOTAL TOTAL 1
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 5 5 1.00000 Detail Abs Unweighted Yes Yes abs NA 0 q1 1 detail_freqs_valid Detail|AbsUnweighted 2097168 3 age 18-39 age 1 age 2
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 5 6 1.00000 Detail Abs Unweighted Yes Yes abs NA 0 q1 1 detail_freqs_valid Detail|AbsUnweighted 2097168 3 age 40+ age 2 age 2
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 6 4 66.66667 Detail Percent Unweighted Yes Yes in % NA 1 q1 1 detail_perc_valid Detail|PercentUnweighted 33554448 2 TOTAL DC#TOTAL 1 DC#TOTAL TOTAL 1
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 6 5 50.00000 Detail Percent Unweighted Yes Yes in % NA 1 q1 1 detail_perc_valid Detail|PercentUnweighted 33554448 3 age 18-39 age 1 age 2
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 6 6 100.00000 Detail Percent Unweighted Yes Yes in % NA 1 q1 1 detail_perc_valid Detail|PercentUnweighted 33554448 3 age 40+ age 2 age 2
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 7 4 1.00000 Detail Abs Unweighted No No abs NA 0 q1 2 detail_freqs_valid Detail|AbsUnweighted 2097168 2 TOTAL DC#TOTAL 1 DC#TOTAL TOTAL 1
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 7 5 1.00000 Detail Abs Unweighted No No abs NA 0 q1 2 detail_freqs_valid Detail|AbsUnweighted 2097168 3 age 18-39 age 1 age 2
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 7 6 0.00000 Detail Abs Unweighted No No abs NA 0 q1 2 detail_freqs_valid Detail|AbsUnweighted 2097168 3 age 40+ age 2 age 2
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 8 4 33.33333 Detail Percent Unweighted No No in % NA 1 q1 2 detail_perc_valid Detail|PercentUnweighted 33554448 2 TOTAL DC#TOTAL 1 DC#TOTAL TOTAL 1
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 8 5 50.00000 Detail Percent Unweighted No No in % NA 1 q1 2 detail_perc_valid Detail|PercentUnweighted 33554448 3 age 18-39 age 1 age 2
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 8 6 0.00000 Detail Percent Unweighted No No in % NA 1 q1 2 detail_perc_valid Detail|PercentUnweighted 33554448 3 age 40+ age 2 age 2
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 9 4 3.00000 Valid Abs Unweighted VALID CASES VALID CASES abs NA 0 q1 1 n_valid_freqs Valid|AbsUnweighted 2097664 2 TOTAL DC#TOTAL 1 DC#TOTAL TOTAL 1
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 9 5 2.00000 Valid Abs Unweighted VALID CASES VALID CASES abs NA 0 q1 1 n_valid_freqs Valid|AbsUnweighted 2097664 3 age 18-39 age 1 age 2
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 9 6 1.00000 Valid Abs Unweighted VALID CASES VALID CASES abs NA 0 q1 1 n_valid_freqs Valid|AbsUnweighted 2097664 3 age 40+ age 2 age 2
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 10 4 100.00000 Valid Percent Unweighted VALID CASES VALID CASES in % NA 1 q1 1 n_valid_perc Valid|PercentUnweighted 33554944 2 TOTAL DC#TOTAL 1 DC#TOTAL TOTAL 1
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 10 5 100.00000 Valid Percent Unweighted VALID CASES VALID CASES in % NA 1 q1 1 n_valid_perc Valid|PercentUnweighted 33554944 3 age 18-39 age 1 age 2
1e+09 _row_2 2 1 CAT The crosstab’s title The crosstab’s title The crosstab’s title The crosstab’s title NA NA NA 11 NA 10 6 100.00000 Valid Percent Unweighted VALID CASES VALID CASES in % NA 1 q1 1 n_valid_perc Valid|PercentUnweighted 33554944 3 age 40+ age 2 age 2

If we look at the crosstab

m
#> $`2`
#> $`2`[[1]]
#> # The crosstab's title
#>                            TOTAL age   -----
#>                                  18-39 40+  
#> TOTAL                abs     3       2     1
#> Yes                  abs     2       1     1
#>                      in %   66.7    50   100
#> No                   abs     1       1     0
#>                      in %   33.3    50     0
#> VALID CASES          abs     3       2     1
#>                      in %  100     100   100

and say we wanted to generated a color-coded raster of the percent values, we could do this like this:

res |> 
  filter(
    # This removes the data of the "TOTAL" & "VALID CASES" rows:
    RowContent == "Detail", 
    # remove rows with absolute values:
    RowAbsPercent == "Percent"
  ) |> 
  ggplot() +
  geom_tile(aes(
    x = ColTitle2,
    y = RowTitle1,
    fill = Value
  )) + 
  facet_grid(
    ~ as_factor(ColTitle1), 
    scales = "free_x"
  ) +
  scale_x_discrete(position = "top") +
  theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )

(This plot isn’t really interesting; this is just to demonstrate the structure of the crosstabs’ underlying data)