Estimating and reporting

This chapter uses a large number of packages and the SAFI data set, so make sure everything is installed and the data are downloaded by running the code from the Set-up section.

I will create a table of descriptive statistics and a simple regression table.

Generating some fake data

First, we simulate a fake intervention aimed at improving fertilizer adoption. Adoption depends on the treatment, on education, and on a random component. The page on DeclareDesign covers more advanced techniques for generating fake data.

library(tidyverse)
library(here)
library(modelsummary)
library(flextable)
library(estimatr)
library(lmtest)
library(clubSandwich)

rm(list=ls())
set.seed(1)

data <-
  read_csv(here("data/SAFI_clean.csv"), na = "NULL") %>%
  # assign a village-level treatment indicator and join it back onto the household data
  left_join({.} %>%
              select(village) %>%
              distinct(village) %>%
              rowwise() %>%
              mutate(treatment = rbinom(1, 1, 0.5))) %>%
  # simulate education and fertilizer adoption at the household level
  rowwise() %>%
  mutate(educated = rbinom(1, 1, 0.3),
         u = sample(c(0.1, 0.2, 0.3), 1),
         prob = 0.3 * treatment + 0.1 * educated + u,
         uses_fertilizer = rbinom(1, 1, prob)) %>%
  ungroup() %>%
  select(-prob, -u)

Making a table of summary statistics

Using modelsummary and flextable

The modelsummary package is the most convenient way to create tables. To convert them to Word, I use the flextable package.

For a simple table of descriptive statistics, use the datasummary() function. I also define a vector of variable labels, which I use throughout this chapter. Below, I pass it to the labelizor() function, which applies labels to a flextable object. I also apply the autofit() and fix_border_issues() functions to make the table look nicer.

library(modelsummary)
library(flextable)

# vector for labelling variable names
labels = c(no_membrs = "# HH Members",
           years_liv = "Year in village",
           rooms = "# Rooms",
           liv_count = "# Livestock",
           no_meals = "# Meals",
           treatment = "Treated",
           educated = "Educated",
           uses_fertilizer = "Uses fertilizer",
           `(Intercept)` = "Constant")

# descriptive stats
data %>%
  select(where(is.numeric), -ends_with("ID")) %>%
  datasummary(All(.) ~ Mean + SD + min + max + Histogram , data = ., output = "flextable") %>%
  labelizor(j =1,labels = labels, part = "all")%>% 
  fix_border_issues() %>% 
  autofit()

                 Mean   SD     min   max    Histogram
# HH Members     7.19   3.17   2.00  19.00  ▂▅▇▂▃▂▁
Year in village  23.05  16.91  1.00  96.00  ▆▇▆▁▂▁
# Rooms          1.74   1.09   1.00  8.00   ▇▃▂
# Livestock      2.37   1.08   1.00  5.00   ▆▅▇▂
# Meals          2.60   0.49   2.00  3.00   ▅▇
Treated          0.37   0.49   0.00  1.00   ▇▄
Educated         0.30   0.46   0.00  1.00   ▇▃
Uses fertilizer  0.34   0.48   0.00  1.00   ▇▄

Flextables can be easily exported to Word using the save_as_docx() function.
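
For example, a minimal sketch (assuming the table above is first stored in an object, here called desc_tab, and that an output folder exists in the project):

# store the descriptive table in an object ...
desc_tab <-
  data %>%
  select(where(is.numeric), -ends_with("ID")) %>%
  datasummary(All(.) ~ Mean + SD + min + max + Histogram,
              data = ., output = "flextable") %>%
  labelizor(j = 1, labels = labels, part = "all") %>%
  fix_border_issues() %>%
  autofit()

# ... and write it to a Word document
save_as_docx(desc_tab, path = here("output/descriptive_stats.docx"))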

Balance Table

Using modelsummary’s datasummary_balance() function, it is easy to create a balance table:

treat_labels <- c("0" = "Control", "1" = "Treated")

# balance table
data %>%
  select(where(is.numeric), -ends_with("ID")) %>%
  datasummary_balance( ~ treatment , data = ., 
                      output = "flextable", stars = TRUE, 
                      dinm = TRUE, dinm_statistic = "p.value") %>%
  labelizor(j =1,labels = labels, part = "all")%>%
  labelizor(labels = treat_labels, part = "header")%>% 
  fix_border_issues() %>% 
  autofit()

                 Control           Treated
                 Mean   Std. Dev.  Mean   Std. Dev.  Diff. in Means  p
# HH Members     7.0    2.9        7.6    3.6        0.6             0.320
Year in village  21.9   14.9       24.9   19.8       3.0             0.366
# Rooms          1.7    1.2        1.7    1.0        0.0             0.961
# Livestock      2.2    1.0        2.6    1.2        0.3             0.107
# Meals          2.6    0.5        2.6    0.5        0.0             0.594
Educated         0.3    0.5        0.3    0.5        0.0             0.873
Uses fertilizer  0.2    0.4        0.5    0.5        0.3**           0.003

Advanced: Using only Flextable

For more control, the flextable package can convert data frames into good-looking tables using the tabulator() function.

First, make a data frame with summary statistics. I duplicate the data set using bind_rows() to create an "Overall" group. Then I use summarize(across(...)) to apply several summary functions to a number of variables.

summstats <-
  bind_rows(data %>% 
              mutate(Treatment = ifelse(treatment,
                                        " Treatment",
                                        "  Control")),
            data %>% 
              mutate(Treatment = "Overall")) %>% 
  select(where(is.numeric),Treatment,-key_ID,-treatment) %>%
  group_by(Treatment) %>%
  summarize(across(.cols = everything(),
                   .fns = list(n =  ~sum(!is.na(.x)),
                               nmiss =  ~sum(is.na(.x)),
                               mean = ~mean(.x,na.rm=TRUE),
                               sd = ~sd(.x,na.rm=TRUE),
                               min =  ~min(.x,na.rm=TRUE),
                               max =  ~max(.x,na.rm=TRUE),
                               iqr =  ~IQR(.x,na.rm=TRUE)),
                      .names =  "{.col}-{.fn}")) %>%
  pivot_longer(cols = -Treatment,
                 names_to = c("Variable",".value"),
                 names_sep="-") 

summstats
## # A tibble: 21 × 9
##    Treatment    Variable            n nmiss   mean     sd   min   max   iqr
##    <chr>        <chr>           <int> <int>  <dbl>  <dbl> <dbl> <dbl> <dbl>
##  1 "  Control"  no_membrs          82     0  6.96   2.86      2    15  3.75
##  2 "  Control"  years_liv          82     0 21.9   14.9       1    70 13.8 
##  3 "  Control"  rooms              82     0  1.74   1.17      1     8  1   
##  4 "  Control"  liv_count          82     0  2.24   1.01      1     4  2   
##  5 "  Control"  no_meals           82     0  2.59   0.496     2     3  1   
##  6 "  Control"  educated           82     0  0.293  0.458     0     1  1   
##  7 "  Control"  uses_fertilizer    82     0  0.244  0.432     0     1  0   
##  8 " Treatment" no_membrs          49     0  7.57   3.64      2    19  5   
##  9 " Treatment" years_liv          49     0 24.9   19.8       2    96 22   
## 10 " Treatment" rooms              49     0  1.73   0.953     1     4  1   
## # ℹ 11 more rows

Then I use flextable’s tabulator() to make output that looks good in Word. Note that tabulator() sorts the columns alphabetically, which would give the order Control, Overall, Treatment. That ordering doesn’t make sense, so I have prepended spaces ("  Control", " Treatment") to force the order Control, Treatment, Overall.

I’ve added a bunch of statistics to show the flexibility:

library(flextable)

summstats %>%
  tabulator(rows = "Variable",
            columns = "Treatment",
            `N` = as_paragraph(as_chunk(n, digits = 0)),
            `Mean (SD)` = as_paragraph(as_chunk(fmt_avg_dev(mean, sd,
                                                            digit1 = 2, digit2 = 2))),
            Range = as_paragraph(as_chunk(min), "-", as_chunk(max))) %>%
  as_flextable()

                 Control                        Treatment                      Overall
Variable         N   Mean (SD)      Range       N   Mean (SD)      Range       N    Mean (SD)      Range
educated         82  0.29 (0.46)    0.0-1.0     49  0.31 (0.47)    0.0-1.0     131  0.30 (0.46)    0.0-1.0
liv_count        82  2.24 (1.01)    1.0-4.0     49  2.57 (1.17)    1.0-5.0     131  2.37 (1.08)    1.0-5.0
no_meals         82  2.59 (0.50)    2.0-3.0     49  2.63 (0.49)    2.0-3.0     131  2.60 (0.49)    2.0-3.0
no_membrs        82  6.96 (2.86)    2.0-15.0    49  7.57 (3.64)    2.0-19.0    131  7.19 (3.17)    2.0-19.0
rooms            82  1.74 (1.17)    1.0-8.0     49  1.73 (0.95)    1.0-4.0     131  1.74 (1.09)    1.0-8.0
uses_fertilizer  82  0.24 (0.43)    0.0-1.0     49  0.51 (0.51)    0.0-1.0     131  0.34 (0.48)    0.0-1.0
years_liv        82  21.94 (14.92)  1.0-70.0    49  24.92 (19.83)  2.0-96.0    131  23.05 (16.91)  1.0-96.0

To add a column with differences, I first define a function to compute the differences (I use a regression rather than a t-test, so that I can add clustered standard errors and similar refinements if I need to; a clustered variant is sketched right after the function below). Then I use summarize(across(...)) in much the same way as above, now to create a data frame called difcol.

get_diffs <- function(var, by){

  # regress the variable on the treatment indicator
  reg <-
    lm(var ~ by) %>%
    broom::tidy()

  # coefficient on the treatment indicator, plus significance stars
  coeff <- round(reg$estimate[2], 2)
  p <- reg$p.value[2]

  stars <- case_when(p < 0.001 ~ "***",
                     p < 0.01 ~ "**",
                     p < 0.05 ~ "*",
                     .default = "")

  paste0(coeff, stars)
}
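
If you want the difference column itself to use clustered standard errors, one option is to swap lm() for estimatr::lm_robust() inside this helper. A hedged sketch, under the assumption that the cluster variable is passed in as a vector (get_diffs_cl is a hypothetical name and is not used in the tables below):

# hypothetical clustered variant of get_diffs(): same output format,
# but with CR2 standard errors clustered on the vector passed as cluster
get_diffs_cl <- function(var, by, cluster){

  reg <-
    estimatr::lm_robust(var ~ by, clusters = cluster, se_type = "CR2") %>%
    broom::tidy()

  coeff <- round(reg$estimate[2], 2)
  p <- reg$p.value[2]

  stars <- case_when(p < 0.001 ~ "***",
                     p < 0.01 ~ "**",
                     p < 0.05 ~ "*",
                     .default = "")

  paste0(coeff, stars)
}

# inside summarize(across(...)), the call would then look like:
# ~get_diffs_cl(var = .x, by = treatment, cluster = village)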

difcol <-
  data %>%
  select(where(is.numeric),-key_ID,treatment) %>%
  summarize(across(.cols = c(everything(), -treatment),
                   .fns = ~get_diffs(var = .x, by = treatment))) %>%
  pivot_longer(cols =everything(),
               names_to = "Variable",
               values_to="Difference")  

Then, all I have to do is add it to tabulator() using its datasup_last argument. Below, I also use a few other flextable functions to make the table nicer, in particular labelizor() to add variable labels, using the named vector I defined above.

descriptive_table_flex <-
  summstats %>%
  tabulator(rows = "Variable",
            columns = "Treatment",
            datasup_last = difcol,
            `N` = as_paragraph(as_chunk(n,digits=0)),
            `Mean (SD)` =  as_paragraph(as_chunk(fmt_avg_dev(mean, sd, digit1=2,digit2 = 2)))) %>%
  as_flextable()  %>%
  labelizor(j = "Variable", labels = labels, part = "all") %>% 
  fix_border_issues() %>% 
  autofit()

descriptive_table_flex

                 Control             Treatment           Overall
Variable         N   Mean (SD)       N   Mean (SD)       N    Mean (SD)       Difference
Educated         82  0.29 (0.46)     49  0.31 (0.47)     131  0.30 (0.46)     0.01
# Livestock      82  2.24 (1.01)     49  2.57 (1.17)     131  2.37 (1.08)     0.33
# Meals          82  2.59 (0.50)     49  2.63 (0.49)     131  2.60 (0.49)     0.05
# HH Members     82  6.96 (2.86)     49  7.57 (3.64)     131  7.19 (3.17)     0.61
# Rooms          82  1.74 (1.17)     49  1.73 (0.95)     131  1.74 (1.09)     -0.01
Uses fertilizer  82  0.24 (0.43)     49  0.51 (0.51)     131  0.34 (0.48)     0.27**
Year in village  82  21.94 (14.92)   49  24.92 (19.83)   131  23.05 (16.91)   2.98

Again, to save it as a Word file, use save_as_docx(descriptive_table_flex, path = "my/file.docx").

Simple regression

A simple regression uses the lm() function. I use the modelsummary() function to display it:

lm <- lm(uses_fertilizer ~ treatment + educated, data = data)
modelsummary(lm, output = "flextable")

             (1)
(Intercept)  0.207
             (0.057)
treatment    0.265
             (0.083)
educated     0.128
             (0.088)
Num.Obs.     131
R2           0.089
R2 Adj.      0.074
AIC          172.5
BIC          184.0
Log.Lik.     -82.241
F            6.230
RMSE         0.45

Robust standard errors

To get robust standard errors clustered at the village level, I use lm_robust() from the estimatr package. Here I ask for the CR2 small-sample correction; the se_type argument also offers "stata" if you want to match Stata's cluster-robust procedure exactly:

library(estimatr)
lmrobust <-
  lm_robust(uses_fertilizer ~ treatment + educated, 
            data = data, clusters = village, se_type = "CR2")


modelsummary(list("LM" = lm, "Robust" = lmrobust), output = "flextable")

             LM        Robust
(Intercept)  0.207     0.207
             (0.057)   (0.082)
treatment    0.265     0.265
             (0.083)   (0.105)
educated     0.128     0.128
             (0.088)   (0.101)
Num.Obs.     131       131
R2           0.089     0.089
R2 Adj.      0.074     0.074
AIC          172.5     172.5
BIC          184.0     184.0
Log.Lik.     -82.241
F            6.230
RMSE         0.45      0.45
Std.Errors             by: village

I’d like to show just the N, R-squared, and adjusted R-squared:

modelsummary(list("LM" = lm, "Robust" = lmrobust), 
             gof_map = c("nobs","r.squared","adj.r.squared"),
             output = "flextable")

             LM        Robust
(Intercept)  0.207     0.207
             (0.057)   (0.082)
treatment    0.265     0.265
             (0.083)   (0.105)
educated     0.128     0.128
             (0.088)   (0.101)
Num.Obs.     131       131
R2           0.089     0.089
R2 Adj.      0.074     0.074

If you want to add the number of clusters, you will need to do some work. modelsummary() gets the N and other goodness-of-fit statistics from the broom::glance() function, which does not report the number of clusters for lm_robust() models. However, you can make sure that it does by defining a custom glance method for lm_robust objects (see here for details on how to use custom glance methods in modelsummary):

glance_custom.lm_robust <- function(x) {
  # this function takes the regular glance() output, and adds an nclusters column
  broom::glance(x) %>%
    mutate(nclusters = x$nclusters)
}

modelsummary(list("LM" = lm, "Robust" = lmrobust), 
             gof_map = c("nobs","r.squared","adj.r.squared", "nclusters"),
             output = "flextable")

             LM        Robust
(Intercept)  0.207     0.207
             (0.057)   (0.082)
treatment    0.265     0.265
             (0.083)   (0.105)
educated     0.128     0.128
             (0.088)   (0.101)
Num.Obs.     131       131
R2           0.089     0.089
R2 Adj.      0.074     0.074
Num.Clust.             3

Now let’s add a probit model!

probit <- glm(uses_fertilizer ~ treatment + educated, 
                family = binomial(link = "probit"), 
                data = data)

modelsummary(list("LM" = lm, "Robust" = lmrobust, "Probit"  = probit), 
             gof_map = c("nobs","r.squared","adj.r.squared", "nclusters"),
             output = "flextable")

             LM        Robust    Probit
(Intercept)  0.207     0.207     -0.813
             (0.057)   (0.082)   (0.173)
treatment    0.265     0.265     0.727
             (0.083)   (0.105)   (0.236)
educated     0.128     0.128     0.366
             (0.088)   (0.101)   (0.250)
Num.Obs.     131       131       131
R2           0.089     0.089
R2 Adj.      0.074     0.074
Num.Clust.             3

Adding cluster-robust standard errors to the probit model is a bit more complex: there is no glm_robust() function. You can estimate a probit model, then use coeftest() to get clustered standard errors, and then use the glance_custom mechanism to add the number of clusters to that.
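
A minimal sketch of that route (probit_coeftest and the glance_custom.coeftest() method are illustrative additions, not used in the table below; the sketch assumes modelsummary() dispatches user-defined glance_custom() methods on the coeftest class):

# probit with CR2 standard errors clustered on village, via coeftest()
probit_coeftest <-
  coeftest(probit,
           vcov. = vcovCR(probit, cluster = data$village, type = "CR2"),
           save = TRUE)

# add the number of clusters to the goodness-of-fit statistics
glance_custom.coeftest <- function(x) {
  tibble(nclusters = length(unique(data$village)))
}

modelsummary(list("Probit" = probit, "Probit, clustered" = probit_coeftest),
             gof_map = c("nobs", "nclusters"),
             output = "flextable")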

However, you can also create your own glm_robust() function (with blackjack and hookers), including tidy() and glance() methods that return everything modelsummary() needs. I use vcovCR() from the clubSandwich package to compute CR2 standard errors, which are the default in estimatr, so everything is nicely consistent with the linear probability model.

# this function estimates a probit model, and then computes the
# cluster-robust standard errors using clubSandwich and coeftest
# it returns a glm_robust object, which is just a modified coeftest object
glm_robust <- function(formula, family, data, cluster, type = "CR2") {
  library(lmtest)
  library(clubSandwich)

  probit <-
    glm(formula,
        family = family,
        data = data)

  model <-
    probit %>%
    coeftest(.,
             vcovCR({.},
                    cluster = data[[cluster]],
                    type = type),
             save = TRUE)

  # this computes the number of unique clusters in the data used
  # for the original model
  attr(model, "nclusters") <-
    data[row.names(model.frame(probit)), cluster] %>% unique() %>% nrow()

  class(model) <- "glm_robust"
  model
}

# this is the custom tidy method for glm_robust objects
# it returns a data frame with the coefficients
tidy.glm_robust <- function(x, ...){
  x[,] %>% as_tibble() %>%
    mutate(term = attr(x, "dimnames")[[1]]) %>%
    select(term,
           estimate = Estimate,
           std.error = `Std. Error`,
           statistic = `z value`,
           p.value = `Pr(>|z|)`)
}


# this is the glance method. It returns a data frame with
# the number of observations, the log likelihood and the number of clusters
glance.glm_robust <- function(x, ...){
  tibble(nobs = attr(x, "nobs"),
         logLik = as.numeric(attr(x, "logLik")),
         nclusters = attr(x, "nclusters"))
}

You can then simply use the glm_robust() function and modelsummary() will know how to handle its output!

probitrobust <- 
  glm_robust(uses_fertilizer ~ treatment + educated, 
             family = binomial(link = "probit"), 
             data = data, cluster="village", type = "CR2")

modelsummary(list("LM" = lm, "LM Robust" = lmrobust, 
                  "Probit"  = probit, "Probit Robust" = probitrobust), 
             gof_map = c("nobs","r.squared","adj.r.squared", "nclusters"),
             coef_map = labels,
             stars = TRUE,
             output = "flextable") %>%
  autofit()

            LM         LM Robust  Probit     Probit Robust
Treated     0.265**    0.265*     0.727**    0.727*
            (0.083)    (0.105)    (0.236)    (0.331)
Educated    0.128      0.128      0.366      0.366
            (0.088)    (0.101)    (0.250)    (0.273)
Constant    0.207***   0.207*     -0.813***  -0.813**
            (0.057)    (0.082)    (0.173)    (0.279)
Num.Obs.    131        131        131        131
R2          0.089      0.089
R2 Adj.     0.074      0.074
Num.Clust.             3                     3
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
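
As before, this final table can be exported to Word, since modelsummary(output = "flextable") returns a flextable. A minimal sketch, assuming an output folder exists in the project:

regression_table <-
  modelsummary(list("LM" = lm, "LM Robust" = lmrobust,
                    "Probit" = probit, "Probit Robust" = probitrobust),
               gof_map = c("nobs", "r.squared", "adj.r.squared", "nclusters"),
               coef_map = labels,
               stars = TRUE,
               output = "flextable") %>%
  autofit()

save_as_docx(regression_table, path = here("output/regression_table.docx"))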