install.packages("stargazer")
install.packages("ggplot2")
install.packages("car")
install.packages("broom")
install.packages("ggthemes")
install.packages("showtext")
install.packages("dplyr")
install.packages("rdrobust")
install.packages("rddensity")

library(stargazer)
library(ggthemes)
library(ggplot2)
library(dplyr)
library(showtext)
library(rdrobust)
library(rddensity)
library(car)
library(broom)

# Fix the RNG seed so every run simulates the same customers
set.seed(20)

# ---- Simulation parameters ----
n_customers     <- 200000  # number of customers to simulate
cutoff          <- 125     # first-order amount from which shipping is free
effect_orders   <- 6       # extra orders added when shipping was free
noise_sd_orders <- 1       # sd of the Gaussian noise added to order counts

# ---- Product catalogue ----
# Each product maps to c(lowest price, highest price)
apparel_products <- list(
  `T-Shirt` = c(25, 75),
  `Jeans`   = c(65, 200),
  `Jacket`  = c(45, 130),
  `Sweater` = c(60, 150),
  `Shoes`   = c(35, 200),
  `Hat`     = c(15, 75),
  `Dress`   = c(45, 150),
  `Socks`   = c(10, 30),
  `Shorts`  = c(45, 100)
)

# Names used when sampling basket contents
product_names <- names(apparel_products)

# Simulate every customer's first order and first-year order count.
#
# Results are written into preallocated vectors and assembled into a single
# data.frame once, instead of creating one data.frame per customer and
# rbind-ing 200,000 of them (which dominates the run time). The random draws
# happen in the same order as before (basket size, products, one price per
# product, base orders, noise), so a given seed yields the same data.
customer_ids <- seq_len(n_customers)
first_order_amounts <- numeric(n_customers)
orders_first_year <- numeric(n_customers)

for (customer_id in customer_ids) {

  # Basket size for the first order: 1 to 3 items
  num_items <- sample(1:3, 1)

  # Products in the basket (the same product may appear more than once)
  chosen_products <- sample(product_names, num_items, replace = TRUE)

  # One uniform price per chosen product, drawn within that product's range;
  # vapply guarantees a numeric vector whatever the basket size
  prices <- vapply(chosen_products, function(prod) {
    price_range <- apparel_products[[prod]]
    runif(1, price_range[1], price_range[2])
  }, numeric(1))

  # Total value of the first order (the RD running variable)
  purchase_amount <- sum(prices)

  # Treatment indicator: 1 if the order reached the free-shipping threshold
  first_order_free_shipping <- as.integer(purchase_amount >= cutoff)

  # Baseline orders: uniform component plus a term growing with order value
  base_orders <- runif(1, 0, 20) + 0.14 * purchase_amount / 10

  # Add the free-shipping effect and Gaussian noise
  next_orders <- base_orders +
    effect_orders * first_order_free_shipping +
    rnorm(1, 0, noise_sd_orders)

  # Order counts are non-negative whole numbers
  next_orders <- round(max(0, next_orders))

  first_order_amounts[customer_id] <- purchase_amount
  orders_first_year[customer_id] <- next_orders
}

# One row per customer
df_all_data <- data.frame(
  customer_id = customer_ids,
  first_order_amount = first_order_amounts,
  addtl_orders_first_year = orders_first_year
)

# Label each customer "yes"/"no" according to whether the first order
# reached the free-shipping cutoff
df_all_data$first_order_free_shipping <- ifelse(
  df_all_data$first_order_amount >= cutoff, "yes", "no"
)

# Draw one age per customer (n_customers values), uniformly from 18 to 75
age <- sample(18:75, size = n_customers, replace = TRUE)

head(age)

# Attach the ages as a new column
df_all_data$age <- age

# Group sizes on each side of the cutoff
table(df_all_data$first_order_free_shipping)

    no    yes 
 78546 121454

head(df_all_data)

summary(df_all_data)

  customer_id     first_order_amount addtl_orders_first_year
 Min.   :     1   Min.   : 10.00     Min.   : 0.00          
 1st Qu.: 50001   1st Qu.: 89.16     1st Qu.:11.00          
 Median :100000   Median :150.50     Median :16.00          
 Mean   :100000   Mean   :161.71     Mean   :15.91          
 3rd Qu.:150000   3rd Qu.:222.28     3rd Qu.:21.00          
 Max.   :200000   Max.   :579.46     Max.   :34.00          
 first_order_free_shipping      age       
 Length:200000             Min.   :18.00  
 Class :character          1st Qu.:32.00  
 Mode  :character          Median :46.00  
                           Mean   :46.49  
                           3rd Qu.:61.00  
                           Max.   :75.00

# Register the Google font "Abel" under the family name "abel" and switch
# showtext on so the plots below can use it
font_add_google("Abel", "abel")
showtext_auto()

# Look shared by the three exploratory histograms: minimal theme, no grid
# lines, Abel text, centred bold title, legend on top. Adding a list to a
# ggplot adds its elements one after another.
hist_theme <- list(
  theme_minimal(),
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ),
  theme(
    plot.title = element_text(hjust = 0.5, size = 20, face = "bold", family = "abel"),
    axis.title.x = element_text(size = 14, face = "bold", family = "abel"),
    axis.title.y = element_text(size = 14, face = "bold", family = "abel"),
    legend.position = "top",
    legend.title = element_text(size = 14, face = "bold", family = "abel"),
    legend.text = element_text(size = 12, family = "abel"),
    axis.text.x = element_text(size = 12, family = "abel"),
    axis.text.y = element_text(size = 12, family = "abel")
  )
)

# Distribution of the outcome across all customers
ggplot(df_all_data, aes(x = addtl_orders_first_year)) +
  geom_histogram(bins = 35) +
  labs(
    title = "Additional Orders in First Year",
    x = "Additional Orders First Year",
    y = "Count"
  ) +
  hist_theme

font_add_google("Abel", "abel")
showtext_auto()

# Distribution of the running variable across all customers
ggplot(df_all_data, aes(x = first_order_amount)) +
  geom_histogram(bins = 30) +
  labs(
    title = "First Order Amount",
    x = "First Order Amount",
    y = "Count"
  ) +
  hist_theme

# Outcome distribution, one panel per free-shipping group
ggplot(
  df_all_data,
  aes(x = addtl_orders_first_year, fill = first_order_free_shipping)
) +
  geom_histogram(position = "dodge", bins = 35) +
  facet_wrap(~ first_order_free_shipping) +
  labs(title = "Additional Orders First Year by Free Shipping") +
  scale_fill_manual(values = c("seashell3", "darkslategray3")) +
  hist_theme

# Sharp RD of first-year orders on first-order amount at the cutoff,
# local linear fit (p = 1), on the full simulated sample
rd_res <- rdrobust(
  y = df_all_data$addtl_orders_first_year,
  x = df_all_data$first_order_amount,
  c = cutoff,
  p = 1
)
summary(rd_res)

Sharp RD estimates using local polynomial regression.

Number of Obs.               200000
BW type                       mserd
Kernel                   Triangular
VCE method                       NN

Number of Obs.                78546       121454
Eff. Number of Obs.           37249        37383
Order est. (p)                    1            1
Order bias  (q)                   2            2
BW est. (h)                  47.274       47.274
BW bias (b)                  71.409       71.409
rho (h/b)                     0.662        0.662
Unique Obs.                   78546       121454

=====================================================================
                   Point    Robust Inference
                Estimate         z     P>|z|      [ 95% C.I. ]       
---------------------------------------------------------------------
     RD Effect     6.022    54.059     0.000     [5.804 , 6.241]     
=====================================================================

# Bandwidths chosen by rdrobust (rows h/b, columns left/right)
rd_res$bws

# Estimation bandwidth on each side of the cutoff
left_bw <- rd_res$bws["h", "left"]
right_bw <- rd_res$bws["h", "right"]

# Keep only customers whose first order falls inside the bandwidth window
df <- df_all_data %>%
  filter(between(first_order_amount, cutoff - left_bw, cutoff + right_bw))

# Group sizes inside the window
table(df$first_order_free_shipping)

   no   yes 
37249 37383

# Histogram of customer counts inside the bandwidth window, coloured by
# free-shipping status, with the cutoff marked.
font_add_google("Abel", "abel")
showtext_auto()

ggplot(df, aes(x = first_order_amount, fill = first_order_free_shipping)) +
  geom_histogram(binwidth = 2, alpha = 0.7, position = "identity") +
  geom_vline(xintercept = cutoff, color = "black", linetype = "dashed") +
  labs(title = "Customer Counts",
       x = "First Order Amount",
       y = "Number of Customers") +
  scale_fill_manual(values = c("seashell3", "darkslategray3")) +
  theme_minimal() +
  # Anchor the label to the top of the panel. The previous anchor,
  # max(df$y), referred to a column that is only created further down and
  # measures orders rather than bin counts, so it evaluated to -Inf with a
  # warning. y = Inf with hjust = 1 ends the rotated text at the panel top
  # and needs no blanket suppressWarnings().
  annotate(
    "text", x = cutoff, y = Inf, label = "Cutoff ($125)", angle = 90,
    vjust = 2, hjust = 1, size = 6, family = "abel", fontface = "bold"
  ) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20, face = "bold", family = "abel"),
    axis.title.x = element_text(size = 14, face = "bold", family = "abel"),
    axis.title.y = element_text(size = 14, face = "bold", family = "abel"),
    legend.position = "top",
    legend.title = element_text(size = 14, face = "bold", family = "abel"),
    legend.text = element_text(size = 12, family = "abel"),
    axis.text.x = element_text(size = 12, family = "abel"),
    axis.text.y = element_text(size = 12, family = "abel")
  )

# Assign every customer to a 5-unit-wide bin of first-order amount
df$bin <- cut(
  df$first_order_amount,
  breaks = seq(0, max(df$first_order_amount) + 5, by = 5)
)

# Average order value and average order count per bin and treatment group
bin_means <- df %>%
  group_by(bin, first_order_free_shipping) %>%
  summarise(
    avg_purchase = mean(first_order_amount),
    avg_orders = mean(addtl_orders_first_year),
    .groups = "drop"
  )

# Short aliases used by the RD plot:
#   x = running variable (the one with the cutoff)
#   y = response whose jump at the cutoff is of interest
# Treatment is D = 1 when x >= cutoff.
df <- df %>%
  mutate(
    x = first_order_amount,
    y = addtl_orders_first_year
  )

print(head(df))

  customer_id first_order_amount addtl_orders_first_year
1           1          106.93699                       2
2           3          135.03639                       9
3           5          129.12330                      18
4           6          101.95486                      17
5           7           80.46316                      19
6           9          108.50146                       5
  first_order_free_shipping age       bin         x  y
1                        no  73 (105,110] 106.93699  2
2                       yes  69 (135,140] 135.03639  9
3                       yes  53 (125,130] 129.12330 18
4                        no  56 (100,105] 101.95486 17
5                        no  52   (80,85]  80.46316 19
6                        no  74 (105,110] 108.50146  5

# Height at which the cutoff label is drawn: the largest observed outcome
cutoff_label_y <- max(df$y, na.rm = TRUE)

# RD plot: raw points, large dots for the bin means, and a separate linear
# fit on each side of the cutoff
ggplot(df, aes(x = x, y = y, color = factor(first_order_free_shipping))) +
  geom_point(alpha = 0.6) +
  geom_point(
    data = bin_means,
    aes(x = avg_purchase, y = avg_orders, color = first_order_free_shipping),
    size = 5
  ) +
  scale_color_manual(values = c("no" = "seashell3", "yes" = "darkslategray3")) +
  geom_vline(xintercept = cutoff, linetype = "dashed", color = "black") +
  annotate(
    "text",
    x = cutoff, y = cutoff_label_y,
    label = "Cutoff ($125)",
    angle = 90, vjust = -0.4, hjust = 1,
    size = 6, family = "abel", fontface = "bold"
  ) +
  # Linear fit below the cutoff
  geom_smooth(
    data = filter(df, x < cutoff),
    mapping = aes(x = x, y = y),
    method = "lm", formula = y ~ x, se = FALSE, color = "seashell3"
  ) +
  # Linear fit at and above the cutoff
  geom_smooth(
    data = filter(df, x >= cutoff),
    mapping = aes(x = x, y = y),
    method = "lm", formula = y ~ x, se = FALSE, color = "darkslategray3"
  ) +
  labs(
    title = "Regression Discontinuity Plot",
    x = "First Order Amount",
    y = "Orders in First Year",
    color = "First Order Free Shipping"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20, face = "bold", family = "abel"),
    axis.title.x = element_text(size = 14, face = "bold", family = "abel"),
    axis.title.y = element_text(size = 14, face = "bold", family = "abel"),
    legend.position = "top",
    legend.title = element_text(size = 14, face = "bold", family = "abel"),
    legend.text = element_text(size = 12, family = "abel"),
    axis.text.x = element_text(size = 12, family = "abel"),
    axis.text.y = element_text(size = 12, family = "abel")
  )

# Parametric RD inside the bandwidth window: separate intercept and slope on
# each side of the cutoff. `a * b` expands to a + b + a:b, so the
# coefficients and their names are the same as with the terms spelled out.
model1 <- lm(
  addtl_orders_first_year ~ first_order_amount * first_order_free_shipping,
  data = df
)

# Text regression table with the intercept listed first and fit statistics
# omitted
stargazer(
  model1,
  type = "text",
  style = "aer",
  column.labels = c("Y~X+I(X>Cutoff)+X*I(X>Cutoff)"),
  dep.var.labels = "Regression Discontinuity",
  omit.stat = c("f", "ser", "rsq", "n", "adj.rsq"),
  intercept.bottom = FALSE
)

======================================================================================
                                                       Regression Discontinuity       
                                                    Y~X+I(X>Cutoff)+X*I(X>Cutoff)     
--------------------------------------------------------------------------------------
Constant                                                      10.049***               
                                                               (0.228)                
                                                                                      
first_order_amount                                             0.014***               
                                                               (0.002)                
                                                                                      
first_order_free_shippingyes                                   6.095***               
                                                               (0.403)                
                                                                                      
first_order_amount:first_order_free_shippingyes                 -0.001                
                                                               (0.003)                
                                                                                      
--------------------------------------------------------------------------------------
Notes:                                          ***Significant at the 1 percent level.
                                                **Significant at the 5 percent level. 
                                                *Significant at the 10 percent level.

# Full-precision coefficients of the parametric RD model
print(coefficients(model1))

                                    (Intercept) 
                                  10.0489434694 
                             first_order_amount 
                                   0.0136018246 
                   first_order_free_shippingyes 
                                   6.0953882862 
first_order_amount:first_order_free_shippingyes 
                                  -0.0006212591

# Test H0: no jump at the cutoff, i.e.
#   b(free_shipping) + cutoff * b(amount:free_shipping) = 0.
# `cutoff` is the threshold defined with the simulation parameters; it is
# deliberately not re-assigned here so the test always uses the same
# threshold as the rest of the analysis.
linearHypothesis(
  model1,
  paste0(
    "first_order_free_shippingyes + ", cutoff,
    " * first_order_amount:first_order_free_shippingyes = 0"
  )
)

# Coefficient names, used below to pick out the terms that form the jump
coef_names <- names(coef(model1))
print(coef_names)

[1] "(Intercept)"                                    
[2] "first_order_amount"                             
[3] "first_order_free_shippingyes"                   
[4] "first_order_amount:first_order_free_shippingyes"

# Pick the two coefficients that make up the jump at the cutoff by exact
# name match
b2_name <- coef_names[coef_names == "first_order_free_shippingyes"]
b3_name <- coef_names[
  coef_names == "first_order_amount:first_order_free_shippingyes"
]

model1_coefs <- coef(model1)
b2 <- model1_coefs[b2_name]  # intercept shift for the free-shipping group
b3 <- model1_coefs[b3_name]  # slope change for the free-shipping group

# Difference between the two fitted lines evaluated at the cutoff
causal_impact <- b2 + b3 * cutoff
print(paste("causal_impact of free shipping (on first order):",round(causal_impact,2), "incremental orders"))

[1] "causal_impact of free shipping (on first order): 6.02 incremental orders"

#' Re-estimate a sharp RD at several multiples of the data-driven bandwidth
#'
#' Runs `rdrobust()` once to obtain the selected bandwidths, then refits the
#' model with both the estimation bandwidth (h) and the bias bandwidth (b)
#' multiplied by each value in `scales`.
#'
#' @param y Outcome vector.
#' @param x Running variable, same length as `y`.
#' @param cutoff Threshold passed to `rdrobust()` as `c`.
#' @param p Order of the local polynomial used for the point estimate.
#' @param scales Positive multipliers applied to the selected bandwidths.
#' @return A list with `main` (the initial `rdrobust` fit) and `sensitivity`
#'   (a data.frame with one row per scale: `scale`, `h`, `b`, `estimate`,
#'   `se`, `ci_lower`, `ci_upper`, `p_value`).
rd_sensitivity <- function(y, x, cutoff, p = 1, scales = c(0.5, 1, 1.5, 2, 2.5, 3)) {
  stopifnot(is.numeric(scales), all(scales > 0))

  # First run to get the data-driven bandwidths. `bws` is a matrix with rows
  # "h"/"b" and columns "left"/"right"; index it by name rather than by
  # linear position. The left-side values are used, as before.
  rd_main <- rdrobust(y, x, c = cutoff, p = p)
  h_opt <- rd_main$bws["h", "left"]  # estimation bandwidth
  b_opt <- rd_main$bws["b", "left"]  # bias bandwidth

  # Row 3 of rdrobust's coef/se/pv/ci outputs is the "Robust" row:
  # bias-corrected point estimate with robust standard error and CI.
  robust_row <- 3

  # Zero-row template so an empty `scales` still returns the full schema
  empty_results <- data.frame(
    scale = numeric(),
    h = numeric(),
    b = numeric(),
    estimate = numeric(),
    se = numeric(),
    ci_lower = numeric(),
    ci_upper = numeric(),
    p_value = numeric(),
    stringsAsFactors = FALSE
  )

  fit_at_scale <- function(s) {
    # Scale both bandwidths by the same factor
    h_new <- h_opt * s
    b_new <- b_opt * s

    rd_out <- rdrobust(y, x, c = cutoff, p = p, h = h_new, b = b_new)

    data.frame(
      scale = s,
      h = h_new,
      b = b_new,
      estimate = rd_out$coef[robust_row],
      se = rd_out$se[robust_row],
      ci_lower = rd_out$ci[robust_row, 1],
      ci_upper = rd_out$ci[robust_row, 2],
      p_value = rd_out$pv[robust_row],
      # Drop the names inherited from the CI matrix so rows are numbered
      row.names = NULL
    )
  }

  # Build all rows first and bind once instead of growing the frame in a loop
  results <- do.call(rbind, c(list(empty_results), lapply(scales, fit_at_scale)))

  list(main = rd_main, sensitivity = results)
}

# Bandwidth sensitivity on the full sample: half to three times the
# selected bandwidth in steps of 0.5. Note that this replaces the earlier
# rdrobust fit stored in `rd_res`.
rd_res <- rd_sensitivity(
  y = df_all_data$addtl_orders_first_year,
  x = df_all_data$first_order_amount,
  cutoff = cutoff,
  p = 1,
  scales = seq(0.5, 3, by = 0.5)
)

# One row per bandwidth multiplier
rd_res$sensitivity

#' Plot RD estimates and their confidence intervals against bandwidth
#'
#' @param results Data frame with columns `h`, `estimate`, `ci_lower` and
#'   `ci_upper`, one row per bandwidth.
#' @return A ggplot object: one point with an error bar per bandwidth and a
#'   dashed reference line at zero.
plot_rd_sensitivity <- function(results) {
  # All text elements share the "abel" family; only size and face vary
  abel_text <- function(size, ...) {
    element_text(size = size, family = "abel", ...)
  }

  ggplot(results, aes(x = h, y = estimate)) +
    geom_point(size = 3, color = "black") +
    geom_errorbar(
      aes(ymin = ci_lower, ymax = ci_upper),
      width = 0.2, color = "black"
    ) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
    scale_x_continuous(name = "Estimation bandwidth (h)") +
    scale_y_continuous(name = "Estimated treatment effect") +
    labs(title = "RD Sensitivity Analysis: Effect Across Bandwidths") +
    theme_minimal() +
    theme(
      plot.title = abel_text(20, hjust = 0.5, face = "bold"),
      axis.title.x = abel_text(14, face = "bold"),
      axis.title.y = abel_text(14, face = "bold"),
      legend.position = "top",
      legend.title = abel_text(14, face = "bold"),
      legend.text = abel_text(12),
      axis.text.x = abel_text(12),
      axis.text.y = abel_text(12)
    )
}

# Draw the bandwidth-sensitivity figure
plot_rd_sensitivity(rd_res$sensitivity)

# Local quadratic (p = 2) RD, fitted on the bandwidth-restricted sample `df`;
# the result replaces the sensitivity output stored in `rd_res`
rd_res <- rdrobust(
  y = df$addtl_orders_first_year,
  x = df$first_order_amount,
  c = cutoff,
  p = 2
)
summary(rd_res)

Sharp RD estimates using local polynomial regression.

Number of Obs.                74632
BW type                       mserd
Kernel                   Triangular
VCE method                       NN

Number of Obs.                37249        37383
Eff. Number of Obs.           16360        17021
Order est. (p)                    2            2
Order bias  (q)                   3            3
BW est. (h)                  20.348       20.348
BW bias (b)                  29.487       29.487
rho (h/b)                     0.690        0.690
Unique Obs.                   37249        37383

=====================================================================
                   Point    Robust Inference
                Estimate         z     P>|z|      [ 95% C.I. ]       
---------------------------------------------------------------------
     RD Effect     5.867    25.354     0.000     [5.363 , 6.262]     
=====================================================================

# ---- Binned RD plot with a linear (p = 1) global fit ----
# hide = TRUE keeps rdplot from drawing; the ggplot is restyled and shown below
out_p1 <- rdplot(
  y = df$addtl_orders_first_year,
  x = df$first_order_amount,
  c = cutoff,
  binselect = "esmv",
  kernel = "triangular",
  p = 1,
  col.dots = "black",
  col.lines = "darkslategray3",
  hide = TRUE
)

font_add_google("Abel", "abel")
showtext_auto()

# Thicken the fitted lines (layer 2 = left of cutoff, layer 3 = right)
out_p1$rdplot$layers[[2]]$aes_params$size <- 1.5
out_p1$rdplot$layers[[3]]$aes_params$size <- 1.5

out_p1$rdplot +
  theme_minimal() +
  theme(
    text = element_text(family = "abel"),
    plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 12)
  ) +
  labs(title = "RDD Linearity Plot")

# ---- Same plot with a quadratic (p = 2) global fit ----
out_p2 <- rdplot(
  y = df$addtl_orders_first_year,
  x = df$first_order_amount,
  c = cutoff,
  binselect = "esmv",
  kernel = "triangular",
  p = 2,
  col.dots = "black",            # dot color
  col.lines = "darkslategray3",  # regression line color
  hide = TRUE
)

font_add_google("Abel", "abel")
showtext_auto()

# Thicken the fitted lines (layer 2 = left of cutoff, layer 3 = right)
out_p2$rdplot$layers[[2]]$aes_params$size <- 1.5
out_p2$rdplot$layers[[3]]$aes_params$size <- 1.5

out_p2$rdplot +
  theme_minimal() +
  theme(
    text = element_text(family = "abel"),
    plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 12)
  ) +
  labs(title = "RDD Linearity Plot")

# Covariate balance check on the bandwidth-restricted sample `df`: regress
# age on the running variable, the free-shipping indicator and their
# interaction, mirroring the specification used for the outcome model.
# The running variable is not centred at the cutoff, so the coefficient on
# the indicator alone is the gap at first_order_amount = 0; the gap at the
# cutoff is that coefficient plus cutoff times the interaction coefficient.
balance_test_age <- lm(age ~ first_order_amount + first_order_free_shipping + first_order_amount:first_order_free_shipping, data = df)
summary(balance_test_age)

Call:
lm(formula = age ~ first_order_amount + first_order_free_shipping + 
    first_order_amount:first_order_free_shipping, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-28.8458 -14.4937   0.1709  14.5542  28.6880 

Coefficients:
                                                 Estimate Std. Error t value
(Intercept)                                     45.987895   0.652188  70.513
first_order_amount                               0.004168   0.006354   0.656
first_order_free_shippingyes                    -0.961620   1.151474  -0.835
first_order_amount:first_order_free_shippingyes  0.006401   0.009012   0.710
                                                Pr(>|t|)    
(Intercept)                                       <2e-16 ***
first_order_amount                                 0.512    
first_order_free_shippingyes                       0.404    
first_order_amount:first_order_free_shippingyes    0.478    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 16.79 on 74628 degrees of freedom
Multiple R-squared:  7.023e-05,	Adjusted R-squared:  3.003e-05 
F-statistic: 1.747 on 3 and 74628 DF,  p-value: 0.155

# Age quartiles within each free-shipping group, used as boxplot labels
quartiles_df <- df %>%
  group_by(first_order_free_shipping) %>%
  summarise(
    Q1 = quantile(age, 0.25),
    Median = quantile(age, 0.5),
    Q3 = quantile(age, 0.75)
  )

# Boxplot of age by group, annotated with Q1, median (bold) and Q3
ggplot(
  df,
  aes(x = first_order_free_shipping, y = age, fill = first_order_free_shipping)
) +
  geom_boxplot(alpha = 0.6) +
  scale_fill_manual(values = c("no" = "seashell3", "yes" = "darkslategray3")) +
  geom_text(
    data = quartiles_df,
    aes(x = first_order_free_shipping, y = Q1, label = round(Q1, 1)),
    vjust = -0.5
  ) +
  geom_text(
    data = quartiles_df,
    aes(x = first_order_free_shipping, y = Median, label = round(Median, 1)),
    vjust = -0.5, fontface = "bold"
  ) +
  geom_text(
    data = quartiles_df,
    aes(x = first_order_free_shipping, y = Q3, label = round(Q3, 1)),
    vjust = -0.5
  ) +
  labs(
    title = "Boxplot of Age by Free Shipping Status",
    x = "Free Shipping on First Order",
    y = "Age"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20, face = "bold", family = "abel"),
    axis.title.x = element_text(size = 14, face = "bold", family = "abel"),
    axis.title.y = element_text(size = 14, face = "bold", family = "abel"),
    legend.position = "top",
    legend.title = element_text(size = 14, face = "bold", family = "abel"),
    legend.text = element_text(size = 12, family = "abel"),
    axis.text.x = element_text(size = 12, family = "abel"),
    axis.text.y = element_text(size = 12, family = "abel")
  )

# Overlaid age densities for the two free-shipping groups
ggplot(df, aes(x = age, fill = first_order_free_shipping)) +
  geom_density(alpha = 0.4) +
  scale_fill_manual(values = c("seashell3", "darkslategray3")) +
  labs(
    title = "Density of Age by Free Shipping Group",
    x = "Age",
    y = "Density"
  ) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20, face = "bold", family = "abel"),
    axis.title.x = element_text(size = 14, face = "bold", family = "abel"),
    axis.title.y = element_text(size = 14, face = "bold", family = "abel"),
    legend.position = "top",
    legend.title = element_text(size = 14, face = "bold", family = "abel"),
    legend.text = element_text(size = 12, family = "abel"),
    axis.text.x = element_text(size = 12, family = "abel"),
    axis.text.y = element_text(size = 12, family = "abel")
  )

# Age against the running variable with a loess trend: a visual check that
# the covariate does not jump at the cutoff.
ggplot(df, aes(x = first_order_amount, y = age)) +
    geom_point(alpha = 0.1, color = "lightgrey") +  # faded light grey points
    geom_smooth(method = "loess", formula = y ~ x, se = FALSE, color = "darkslategray3", linewidth = 2) +  # thick dark line
    geom_vline(xintercept = cutoff, linetype = "dashed", color = "black", linewidth = 1) +
    labs(
      title = "Age vs. Running Variable",
      x = "First order amount",
      y = "Age"
    ) +
    theme_minimal() +
    # The y axis of this plot is age, so anchor the label at the largest age
    # (not at max(df$y), which is the largest order count). hjust = 1 ends
    # the rotated text at the anchor so it stays inside the panel.
    annotate(
      "text", x = cutoff, y = max(df$age, na.rm = TRUE), label = "Cutoff ($125)", angle = 90,
      vjust = 2, hjust = 1, size = 6, family = "abel", fontface = "bold"
    ) +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) +
    theme(
      plot.title = element_text(hjust = 0.5, size = 20, face = "bold", family = "abel"),
      axis.title.x = element_text(size = 14, face = "bold", family = "abel"),
      axis.title.y = element_text(size = 14, face = "bold", family = "abel"),
      legend.position = "top",
      legend.title = element_text(size = 14, face = "bold", family = "abel"),
      legend.text = element_text(size = 12, family = "abel"),
      axis.text.x = element_text(size = 12, family = "abel"),
      axis.text.y = element_text(size = 12, family = "abel")
    )

# Running variable of the bandwidth-restricted sample
x <- df$first_order_amount

# Manipulation test at the cutoff using local polynomial density estimation:
# checks for a discontinuity in the density of the running variable
rdd_out <- rddensity(X = x, c = cutoff)

summary(rdd_out)

# Bandwidths (left & right) used for the density estimation
rdd_out$h

Manipulation testing using local polynomial density estimation.

Number of obs =       74632
Model =               unrestricted
Kernel =              triangular
BW method =           estimated
VCE method =          jackknife

c = 125               Left of c           Right of c          
Number of obs         37249               37383               
Eff. Number of obs    8551                6148                
Order est. (p)        2                   2                   
Order bias (q)        3                   3                   
BW est. (h)           10.321              7.296               

Method                T                   P > |T|             
Robust                -1.3472             0.1779              


P-values of binomial tests (H0: p=0.5).

Window Length / 2          <c     >=c    P>|T|
0.029                      22      20    0.8776
0.058                      50      39    0.2891
0.087                      76      64    0.3526
0.116                     108      94    0.3604
0.145                     129     123    0.7529
0.174                     160     144    0.3897
0.203                     183     164    0.3339
0.232                     209     189    0.3409
0.261                     233     208    0.2531
0.290                     257     224    0.1445

# Density plot for the manipulation test, restricted to one estimation
# bandwidth on each side of the cutoff. The bandwidths are stored in the
# list `rdd_out$h` (elements `left` and `right`); `rdd_out$hl` / `$hr` do
# not exist, which left plotRange empty so the default range was drawn.
densityplot <- rdplotdensity(
  rdd_out, x,
  plotRange = c(cutoff - rdd_out$h$left, cutoff + rdd_out$h$right),
  plotN = 25, noPlot = TRUE
)

# Keep only the ggplot object so its layers can be restyled
densityplot <- densityplot$Estplot

class(densityplot)

# Layer parameters are set directly on the built plot. ggplot2 looks these
# up under its canonical aesthetic names, so the British spelling `colour`
# is required; a parameter stored as `color` is ignored.

# bars
densityplot$layers[[1]]$aes_params$fill <- "gray"       # interior color
densityplot$layers[[1]]$aes_params$colour <- "black"    # border color

# ci
densityplot$layers[[2]]$aes_params$fill <- "black"
densityplot$layers[[2]]$aes_params$alpha <- 0.3

# line 1
densityplot$layers[[3]]$aes_params$colour <- "black"
densityplot$layers[[3]]$aes_params$alpha <- 0.3

# ci
densityplot$layers[[4]]$aes_params$fill <- "darkslategray3"
densityplot$layers[[4]]$aes_params$alpha <- 0.3

# line 2
densityplot$layers[[5]]$aes_params$colour <- "darkslategray3"
densityplot$layers[[5]]$aes_params$alpha <- 0.7


font_add_google("Abel", "abel")
showtext_auto()

densityplot +
  theme_minimal() +
  theme(
    text = element_text(family = "abel"),
    plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 12)
  ) +
  ggtitle("rdplot for McCrary Test")

	left	right
h	47.27355	47.27355
b	71.40938	71.40938

	Res.Df	RSS	Df	Sum of Sq	F	Pr(>F)
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	74629	2752687	NA	NA	NA	NA
2	74628	2577678	1	175008.8	5066.792	0

	scale	h	b	estimate	se	ci_lower	ci_upper	p_value
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
CI Lower	0.5	23.63678	35.70469	6.039137	0.15684587	5.731725	6.346550	0
CI Lower1	1.0	47.27355	71.40938	6.022717	0.11140968	5.804358	6.241076	0
CI Lower2	1.5	70.91033	107.11408	6.023868	0.09166236	5.844213	6.203523	0
CI Lower3	2.0	94.54711	142.81877	6.052953	0.08222827	5.891789	6.214117	0
CI Lower4	2.5	118.18388	178.52346	6.063990	0.07831533	5.910495	6.217485	0
CI Lower5	3.0	141.82066	214.22815	6.044493	0.07591735	5.895698	6.193289	0

Install Packages

Load Libraries

	customer_id	first_order_amount	addtl_orders_first_year	first_order_free_shipping	age
	<int>	<dbl>	<dbl>	<chr>	<int>
1	1	106.93699	2	no	73
2	2	55.58302	9	no	65
3	3	135.03639	9	yes	69
4	4	18.84516	13	no	61
5	5	129.12330	18	yes	53
6	6	101.95486	17	no	56