Estimating and reporting

This chapter uses a large number of packages and the SAFI data set, so make sure everything is installed and the data are downloaded by running the code from the Set-up section.

I will create a table of descriptive statistics and a simple regression table.

Generating some fake data

First, we simulate a fake intervention aimed at improving fertilizer adoption. Adoption depends on the treatment, on education, and on a random component. The page on DeclareDesign covers more advanced techniques for generating fake data.

library(tidyverse)
library(here)
library(modelsummary)
library(flextable)
library(estimatr)
library(lmtest)
library(clubSandwich)

rm(list=ls())
set.seed(1)

data <-
  read_csv(here("data/SAFI_clean.csv"), na = "NULL") %>%
  # assign a village-level treatment indicator and join it back onto the household data
  left_join({.} %>%
              select(village) %>%
              distinct(village) %>%
              rowwise() %>%
              mutate(treatment = rbinom(1, 1, 0.5))) %>%
  # simulate education and fertilizer adoption at the household level
  rowwise() %>%
  mutate(educated = rbinom(1, 1, 0.3),
         u = sample(c(0.1, 0.2, 0.3), 1),
         prob = 0.3 * treatment + 0.1 * educated + u,
         uses_fertilizer = rbinom(1, 1, prob)) %>%
  ungroup() %>%
  select(-prob, -u)

Making a table of summary statistics

Using modelsummary and flextable

The modelsummary package is the most convenient way to create tables. To convert them to Word, I use the flextable package.

For a simple table of descriptive statistics, use the datasummary() function. I also define a vector of variable labels, which I use throughout this chapter. Below, I pass it to the labelizor() function, which applies labels to a flextable object. I also apply the autofit() and fix_border_issues() functions to make the table look nicer.

library(modelsummary)
library(flextable)

# vector for labelling variable names
labels = c(no_membrs = "# HH Members",
           years_liv = "Year in village",
           rooms = "# Rooms",
           liv_count = "# Livestock",
           no_meals = "# Meals",
           treatment = "Treated",
           educated = "Educated",
           uses_fertilizer = "Uses fertilizer",
           `(Intercept)` = "Constant")

# descriptive stats
data %>%
  select(where(is.numeric), -ends_with("ID")) %>%
  datasummary(All(.) ~ Mean + SD + min + max + Histogram , data = ., output = "flextable") %>%
  labelizor(j =1,labels = labels, part = "all")%>% 
  fix_border_issues() %>% 
  autofit()

                 Mean   SD     min   max    Histogram
# HH Members     7.19   3.17   2.00  19.00  ▂▅▇▂▃▂▁
Year in village  23.05  16.91  1.00  96.00  ▆▇▆▁▂▁
# Rooms          1.74   1.09   1.00  8.00   ▇▃▂
# Livestock      2.37   1.08   1.00  5.00   ▆▅▇▂
# Meals          2.60   0.49   2.00  3.00   ▅▇
Treated          0.37   0.49   0.00  1.00   ▇▄
Educated         0.30   0.46   0.00  1.00   ▇▃
Uses fertilizer  0.34   0.48   0.00  1.00   ▇▄

Flextables can be easily exported to Word using the save_as_docx() function.
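
For example, a minimal sketch (assuming the table above is first stored in an object, here called desc_tab, and that an output folder exists in the project):

# store the descriptive table in an object ...
desc_tab <-
  data %>%
  select(where(is.numeric), -ends_with("ID")) %>%
  datasummary(All(.) ~ Mean + SD + min + max + Histogram,
              data = ., output = "flextable") %>%
  labelizor(j = 1, labels = labels, part = "all") %>%
  fix_border_issues() %>%
  autofit()

# ... and write it to a Word document
save_as_docx(desc_tab, path = here("output/descriptive_stats.docx"))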

Balance Table

Using modelsummary’s datasummary_balance() function, it is easy to create a balance table:

treat_labels <- c("0" = "Control", "1" = "Treated")

# balance table
data %>%
  select(where(is.numeric), -ends_with("ID")) %>%
  datasummary_balance( ~ treatment , data = ., 
                      output = "flextable", stars = TRUE, 
                      dinm = TRUE, dinm_statistic = "p.value") %>%
  labelizor(j =1,labels = labels, part = "all")%>%
  labelizor(labels = treat_labels, part = "header")%>% 
  fix_border_issues() %>% 
  autofit()

                 Control           Treated
                 Mean   Std. Dev.  Mean   Std. Dev.  Diff. in Means  p
# HH Members     7.0    2.9        7.6    3.6        0.6             0.320
Year in village  21.9   14.9       24.9   19.8       3.0             0.366
# Rooms          1.7    1.2        1.7    1.0        0.0             0.961
# Livestock      2.2    1.0        2.6    1.2        0.3             0.107
# Meals          2.6    0.5        2.6    0.5        0.0             0.594
Educated         0.3    0.5        0.3    0.5        0.0             0.873
Uses fertilizer  0.2    0.4        0.5    0.5        0.3**           0.003

Advanced: Using only Flextable

For more control, the flextable package can convert data frames into good-looking tables using the tabulator() function.

First, make a data frame with summary statistics. I duplicate the data set using bind_rows() to create an "Overall" group. Then I use summarize(across(...)) to apply several summary functions to a number of variables.

summstats <-
  bind_rows(data %>% 
              mutate(Treatment = ifelse(treatment,
                                        " Treatment",
                                        "  Control")),
            data %>% 
              mutate(Treatment = "Overall")) %>% 
  select(where(is.numeric),Treatment,-key_ID,-treatment) %>%
  group_by(Treatment) %>%
  summarize(across(.cols = everything(),
                   .fns = list(n =  ~sum(!is.na(.x)),
                               nmiss =  ~sum(is.na(.x)),
                               mean = ~mean(.x,na.rm=TRUE),
                               sd = ~sd(.x,na.rm=TRUE),
                               min =  ~min(.x,na.rm=TRUE),
                               max =  ~max(.x,na.rm=TRUE),
                               iqr =  ~IQR(.x,na.rm=TRUE)),
                      .names =  "{.col}-{.fn}")) %>%
  pivot_longer(cols = -Treatment,
                 names_to = c("Variable",".value"),
                 names_sep="-") 

summstats
## # A tibble: 21 × 9
##    Treatment    Variable            n nmiss   mean     sd   min   max   iqr
##    <chr>        <chr>           <int> <int>  <dbl>  <dbl> <dbl> <dbl> <dbl>
##  1 "  Control"  no_membrs          82     0  6.96   2.86      2    15  3.75
##  2 "  Control"  years_liv          82     0 21.9   14.9       1    70 13.8 
##  3 "  Control"  rooms              82     0  1.74   1.17      1     8  1   
##  4 "  Control"  liv_count          82     0  2.24   1.01      1     4  2   
##  5 "  Control"  no_meals           82     0  2.59   0.496     2     3  1   
##  6 "  Control"  educated           82     0  0.293  0.458     0     1  1   
##  7 "  Control"  uses_fertilizer    82     0  0.244  0.432     0     1  0   
##  8 " Treatment" no_membrs          49     0  7.57   3.64      2    19  5   
##  9 " Treatment" years_liv          49     0 24.9   19.8       2    96 22   
## 10 " Treatment" rooms              49     0  1.73   0.953     1     4  1   
## # ℹ 11 more rows

Then I use flextable’s tabulator() to make output that looks good in Word. Note that tabulator() sorts the columns alphabetically, which would give the order Control, Overall, Treatment. That ordering doesn’t make sense, so I have prepended spaces ("  Control", " Treatment") to force the order Control, Treatment, Overall.

I’ve added a bunch of statistics to show the flexibility:

library(flextable)

summstats %>%
  tabulator(rows = "Variable",
            columns = "Treatment",
            `N` = as_paragraph(as_chunk(n, digits = 0)),
            `Mean (SD)` = as_paragraph(as_chunk(fmt_avg_dev(mean, sd,
                                                            digit1 = 2, digit2 = 2))),
            Range = as_paragraph(as_chunk(min), "-", as_chunk(max))) %>%
  as_flextable()

                 Control                        Treatment                      Overall
Variable         N   Mean (SD)      Range       N   Mean (SD)      Range       N    Mean (SD)      Range
educated         82  0.29 (0.46)    0.0-1.0     49  0.31 (0.47)    0.0-1.0     131  0.30 (0.46)    0.0-1.0
liv_count        82  2.24 (1.01)    1.0-4.0     49  2.57 (1.17)    1.0-5.0     131  2.37 (1.08)    1.0-5.0
no_meals         82  2.59 (0.50)    2.0-3.0     49  2.63 (0.49)    2.0-3.0     131  2.60 (0.49)    2.0-3.0
no_membrs        82  6.96 (2.86)    2.0-15.0    49  7.57 (3.64)    2.0-19.0    131  7.19 (3.17)    2.0-19.0
rooms            82  1.74 (1.17)    1.0-8.0     49  1.73 (0.95)    1.0-4.0     131  1.74 (1.09)    1.0-8.0
uses_fertilizer  82  0.24 (0.43)    0.0-1.0     49  0.51 (0.51)    0.0-1.0     131  0.34 (0.48)    0.0-1.0
years_liv        82  21.94 (14.92)  1.0-70.0    49  24.92 (19.83)  2.0-96.0    131  23.05 (16.91)  1.0-96.0

To add a column with differences, I first define a function to compute the differences (I use a regression rather than a t-test, so that I can add clustered standard errors and similar refinements if I need to; a clustered variant is sketched right after the function below). Then I use summarize(across(...)) in much the same way as above, now to create a data frame called difcol.

get_diffs <- function(var, by){

  # regress the variable on the treatment indicator
  reg <-
    lm(var ~ by) %>%
    broom::tidy()

  # coefficient on the treatment indicator, plus significance stars
  coeff <- round(reg$estimate[2], 2)
  p <- reg$p.value[2]

  stars <- case_when(p < 0.001 ~ "***",
                     p < 0.01 ~ "**",
                     p < 0.05 ~ "*",
                     .default = "")

  paste0(coeff, stars)
}
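
If you want the difference column itself to use clustered standard errors, one option is to swap lm() for estimatr::lm_robust() inside this helper. A hedged sketch, under the assumption that the cluster variable is passed in as a vector (get_diffs_cl is a hypothetical name and is not used in the tables below):

# hypothetical clustered variant of get_diffs(): same output format,
# but with CR2 standard errors clustered on the vector passed as cluster
get_diffs_cl <- function(var, by, cluster){

  reg <-
    estimatr::lm_robust(var ~ by, clusters = cluster, se_type = "CR2") %>%
    broom::tidy()

  coeff <- round(reg$estimate[2], 2)
  p <- reg$p.value[2]

  stars <- case_when(p < 0.001 ~ "***",
                     p < 0.01 ~ "**",
                     p < 0.05 ~ "*",
                     .default = "")

  paste0(coeff, stars)
}

# inside summarize(across(...)), the call would then look like:
# ~get_diffs_cl(var = .x, by = treatment, cluster = village)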

difcol <-
  data %>%
  select(where(is.numeric),-key_ID,treatment) %>%
  summarize(across(.cols = c(everything(), -treatment),
                   .fns = ~get_diffs(var = .x, by = treatment))) %>%
  pivot_longer(cols =everything(),
               names_to = "Variable",
               values_to="Difference")  

Then, all I have to do is add it to tabulator() using its datasup_last argument. Below, I also use a few other flextable functions to make the table nicer, in particular labelizor() to add variable labels, using the named vector I defined above.

descriptive_table_flex <-
  summstats %>%
  tabulator(rows = "Variable",
            columns = "Treatment",
            datasup_last = difcol,
            `N` = as_paragraph(as_chunk(n,digits=0)),
            `Mean (SD)` =  as_paragraph(as_chunk(fmt_avg_dev(mean, sd, digit1=2,digit2 = 2)))) %>%
  as_flextable()  %>%
  labelizor(j = "Variable", labels = labels, part = "all") %>% 
  fix_border_issues() %>% 
  autofit()

descriptive_table_flex

                 Control             Treatment           Overall
Variable         N   Mean (SD)       N   Mean (SD)       N    Mean (SD)       Difference
Educated         82  0.29 (0.46)     49  0.31 (0.47)     131  0.30 (0.46)     0.01
# Livestock      82  2.24 (1.01)     49  2.57 (1.17)     131  2.37 (1.08)     0.33
# Meals          82  2.59 (0.50)     49  2.63 (0.49)     131  2.60 (0.49)     0.05
# HH Members     82  6.96 (2.86)     49  7.57 (3.64)     131  7.19 (3.17)     0.61
# Rooms          82  1.74 (1.17)     49  1.73 (0.95)     131  1.74 (1.09)     -0.01
Uses fertilizer  82  0.24 (0.43)     49  0.51 (0.51)     131  0.34 (0.48)     0.27**
Year in village  82  21.94 (14.92)   49  24.92 (19.83)   131  23.05 (16.91)   2.98

Again, to save it as a Word file, use save_as_docx(descriptive_table_flex, path = "my/file.docx").

Simple regression

A simple regression uses the lm() function. I use the modelsummary() function to display it:

lm <- lm(uses_fertilizer ~ treatment + educated, data = data)
modelsummary(lm, output = "flextable")

             (1)
(Intercept)  0.207
             (0.057)
treatment    0.265
             (0.083)
educated     0.128
             (0.088)
Num.Obs.     131
R2           0.089
R2 Adj.      0.074
AIC          172.5
BIC          184.0
Log.Lik.     -82.241
F            6.230
RMSE         0.45

Robust standard errors

To get robust standard errors clustered at the village level, I use lm_robust() from the estimatr package. Here I ask for the CR2 small-sample correction; the se_type argument also offers "stata" if you want to match Stata's cluster-robust procedure exactly:

library(estimatr)
lmrobust <-
  lm_robust(uses_fertilizer ~ treatment + educated, 
            data = data, clusters = village, se_type = "CR2")


modelsummary(list("LM" = lm, "Robust" = lmrobust), output = "flextable")

             LM        Robust
(Intercept)  0.207     0.207
             (0.057)   (0.082)
treatment    0.265     0.265
             (0.083)   (0.105)
educated     0.128     0.128
             (0.088)   (0.101)
Num.Obs.     131       131
R2           0.089     0.089
R2 Adj.      0.074     0.074
AIC          172.5     172.5
BIC          184.0     184.0
Log.Lik.     -82.241
F            6.230
RMSE         0.45      0.45
Std.Errors             by: village

I’d like to show just the N, R-squared, and adjusted R-squared:

modelsummary(list("LM" = lm, "Robust" = lmrobust), 
             gof_map = c("nobs","r.squared","adj.r.squared"),
             output = "flextable")

             LM        Robust
(Intercept)  0.207     0.207
             (0.057)   (0.082)
treatment    0.265     0.265
             (0.083)   (0.105)
educated     0.128     0.128
             (0.088)   (0.101)
Num.Obs.     131       131
R2           0.089     0.089
R2 Adj.      0.074     0.074

If you want to add the number of clusters, you will need to do some work. modelsummary() gets the N and other goodness-of-fit statistics from the broom::glance() function, which does not report the number of clusters for lm_robust() models. However, you can make sure that it does by defining a custom glance method for lm_robust objects (see here for details on how to use custom glance methods in modelsummary):

glance_custom.lm_robust <- function(x) {
  # this function takes the regular glance() output, and adds an nclusters column
  broom::glance(x) %>%
    mutate(nclusters = x$nclusters)
}

modelsummary(list("LM" = lm, "Robust" = lmrobust), 
             gof_map = c("nobs","r.squared","adj.r.squared", "nclusters"),
             output = "flextable")

             LM        Robust
(Intercept)  0.207     0.207
             (0.057)   (0.082)
treatment    0.265     0.265
             (0.083)   (0.105)
educated     0.128     0.128
             (0.088)   (0.101)
Num.Obs.     131       131
R2           0.089     0.089
R2 Adj.      0.074     0.074
Num.Clust.             3

Now let’s add a probit model!

probit <- glm(uses_fertilizer ~ treatment + educated, 
                family = binomial(link = "probit"), 
                data = data)

modelsummary(list("LM" = lm, "Robust" = lmrobust, "Probit"  = probit), 
             gof_map = c("nobs","r.squared","adj.r.squared", "nclusters"),
             output = "flextable")

             LM        Robust    Probit
(Intercept)  0.207     0.207     -0.813
             (0.057)   (0.082)   (0.173)
treatment    0.265     0.265     0.727
             (0.083)   (0.105)   (0.236)
educated     0.128     0.128     0.366
             (0.088)   (0.101)   (0.250)
Num.Obs.     131       131       131
R2           0.089     0.089
R2 Adj.      0.074     0.074
Num.Clust.             3

Adding cluster-robust standard errors to the probit model is a bit more complex: there is no glm_robust() function. You can estimate a probit model, then use coeftest() to get clustered standard errors, and then use the glance_custom mechanism to add the number of clusters to that.
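
A minimal sketch of that route (probit_coeftest and the glance_custom.coeftest() method are illustrative additions, not used in the table below; the sketch assumes modelsummary() dispatches user-defined glance_custom() methods on the coeftest class):

# probit with CR2 standard errors clustered on village, via coeftest()
probit_coeftest <-
  coeftest(probit,
           vcov. = vcovCR(probit, cluster = data$village, type = "CR2"),
           save = TRUE)

# add the number of clusters to the goodness-of-fit statistics
glance_custom.coeftest <- function(x) {
  tibble(nclusters = length(unique(data$village)))
}

modelsummary(list("Probit" = probit, "Probit, clustered" = probit_coeftest),
             gof_map = c("nobs", "nclusters"),
             output = "flextable")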

However, you can also create your own glm_robust() function (with blackjack and hookers), including tidy() and glance() methods that return everything modelsummary() needs. I use vcovCR() from the clubSandwich package to compute CR2 standard errors, which are the default in estimatr, so everything is nicely consistent with the linear probability model.

# this function estimates a probit model, and then computes the
# cluster-robust standard errors using clubSandwich and coeftest
# it returns a glm_robust object, which is just a modified coeftest object
glm_robust <- function(formula, family, data, cluster, type = "CR2") {
  library(lmtest)
  library(clubSandwich)

  probit <-
    glm(formula,
        family = family,
        data = data)

  model <-
    probit %>%
    coeftest(.,
             vcovCR({.},
                    cluster = data[[cluster]],
                    type = type),
             save = TRUE)

  # this computes the number of unique clusters in the data used
  # for the original model
  attr(model, "nclusters") <-
    data[row.names(model.frame(probit)), cluster] %>% unique() %>% nrow()

  class(model) <- "glm_robust"
  model
}

# this is the custom tidy method for glm_robust objects
# it returns a data frame with the coefficients
tidy.glm_robust <- function(x, ...){
  x[,] %>% as_tibble() %>%
    mutate(term = attr(x, "dimnames")[[1]]) %>%
    select(term,
           estimate = Estimate,
           std.error = `Std. Error`,
           statistic = `z value`,
           p.value = `Pr(>|z|)`)
}


# this is the glance method. It returns a data frame with
# the number of observations, the log likelihood and the number of clusters
glance.glm_robust <- function(x, ...){
  tibble(nobs = attr(x, "nobs"),
         logLik = as.numeric(attr(x, "logLik")),
         nclusters = attr(x, "nclusters"))
}

You can then simply use the glm_robust() function and modelsummary() will know how to handle its output!

probitrobust <- 
  glm_robust(uses_fertilizer ~ treatment + educated, 
             family = binomial(link = "probit"), 
             data = data, cluster="village", type = "CR2")

modelsummary(list("LM" = lm, "LM Robust" = lmrobust, 
                  "Probit"  = probit, "Probit Robust" = probitrobust), 
             gof_map = c("nobs","r.squared","adj.r.squared", "nclusters"),
             coef_map = labels,
             stars = TRUE,
             output = "flextable") %>%
  autofit()

            LM         LM Robust  Probit     Probit Robust
Treated     0.265**    0.265*     0.727**    0.727*
            (0.083)    (0.105)    (0.236)    (0.331)
Educated    0.128      0.128      0.366      0.366
            (0.088)    (0.101)    (0.250)    (0.273)
Constant    0.207***   0.207*     -0.813***  -0.813**
            (0.057)    (0.082)    (0.173)    (0.279)
Num.Obs.    131        131        131        131
R2          0.089      0.089
R2 Adj.     0.074      0.074
Num.Clust.             3                     3
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
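
As before, this final table can be exported to Word, since modelsummary(output = "flextable") returns a flextable. A minimal sketch, assuming an output folder exists in the project:

regression_table <-
  modelsummary(list("LM" = lm, "LM Robust" = lmrobust,
                    "Probit" = probit, "Probit Robust" = probitrobust),
               gof_map = c("nobs", "r.squared", "adj.r.squared", "nclusters"),
               coef_map = labels,
               stars = TRUE,
               output = "flextable") %>%
  autofit()

save_as_docx(regression_table, path = here("output/regression_table.docx"))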