
Test

  • This hypothetical experiment tests two landing page CTAs for a class booking platform:
  • Control: "Schedule" vs. Treatment: "Book Now"
  • The sample size is ~14K users per group
  • I will use a two-proportion z-test, which is well suited to large sample sizes
  • I am testing whether the treatment performs better than the control (a directional hypothesis), because the team is interested in moving forward with the treatment

Result

  • The test established that the treatment performed significantly better than the control (at alpha = 0.05): its conversion rate is higher by at least 0.23 percentage points. The practical significance is very small by Cohen's h standards (h = 0.03), BUT the cost of implementing the treatment is low, the volume is high, and there will be meaningful business impact for the studio customers (significantly improved booking rate, less confusion, increased revenue). The result had the full desired statistical power (>80%).

Recommendation

  • Due to the significance, power, and business impact, I recommend moving forward with implementing the treatment.

TL;DR for results

  • Skip to "Results Summary" at the end


In [ ]:
install.packages('pwr')
install.packages('glue')
In [ ]:
library(pwr)   # Power analysis for proportions (pwr.2p.test, ES.h)
library(glue)  # String interpolation

Test Design¶

In [ ]:
alpha <- 0.05            # Significance level
power <- 0.80            # Statistical power (Probability of detecting an effect when it exists; 0.8 is standard)
control <- 0.04          # Baseline rate
effect <- 0.15           # Desired relative effect (a 15% lift over baseline)
mde <- control * effect  # Minimum Detectable Effect (MDE): the difference you want to detect, in absolute terms
treatment <- control + mde  # Treatment rate (baseline + MDE)
print(paste('Control:',control))
print(paste('Treatment:',treatment))
[1] "Control: 0.04"
[1] "Treatment: 0.046"
In [ ]:
p_1 = treatment
p_2 = control
p1_label = "Treatment"
p2_label = "Control"

alternative = "greater"
# In reference to p1:
  # "greater":   p1 is greater than p2
  # "less":      p1 is less than p2
  # "two.sided": p1 is different from p2

hypothesis <- switch(alternative,
  greater = sprintf("%s (%.4f) is greater than %s (%.4f)", p1_label, p_1, p2_label, p_2),
  less = sprintf("%s (%.4f) is less than %s (%.4f)", p1_label, p_1, p2_label, p_2),
  two.sided = sprintf("%s (%.4f) is different from %s (%.4f)", p1_label, p_1, p2_label, p_2)
)

cat("Hypothesis:",hypothesis)
Hypothesis: Treatment (0.0460) is greater than Control (0.0400)


In [ ]:
# Cohen's h (standardized effect size for proportions)

effect_size = ES.h(treatment, control)

cat(sprintf("Minimum Detectable Effect (MDE): %.3f\n", mde))
cat(sprintf("Effect Size (Cohen's h): %.3f\n", effect_size))
Minimum Detectable Effect (MDE): 0.006
Effect Size (Cohen's h): 0.030

Cohen's h benchmarks:

  • 0.2 = small effect
  • 0.5 = medium effect
  • 0.8 = large effect

If the effect is tiny, it will require a very large sample size to detect.
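
As a quick illustration (an aside reusing the alpha and power values set above), here's how the required sample size per group grows as Cohen's h shrinks:

In [ ]:
# Illustrative aside: required n per group balloons as Cohen's h shrinks
# (reuses alpha, power, and the one-sided alternative from above)
for (h_val in c(0.2, 0.1, 0.05, 0.03)) {
  n_req <- pwr.2p.test(h = h_val, sig.level = alpha, power = power,
                       alternative = "greater")$n
  cat(sprintf("h = %.2f -> n per group = %.0f\n", h_val, ceiling(n_req)))
}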


Calculate the minimum sample size for each group (cell) for a one-sided test:

  • A one-sided test is used when you want to test if one group performs specifically better or worse than the other (a directional hypothesis).
In [ ]:
# Determine the minimum number of samples for each group

# pwr.2p.test requires the standardized effect size (Cohen's h) as input
result1 <- pwr.2p.test(h = effect_size, sig.level = alpha, power = power, alternative = alternative)
In [ ]:
# Using the effect size as input
cat("Using the effect size as input:\n")
cat(paste("(alternative)", alternative, ": n =", round(result1$n)), "\n")
Using the effect size as input:
(alternative) greater : n = 14118 
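
For comparison (a hypothetical aside, not part of this test's design), the same calculation with a two-sided alternative shows the sample-size cost of a non-directional hypothesis:

In [ ]:
# Hypothetical aside: a two-sided test needs more samples per group
# for the same effect size, alpha, and power
result2 <- pwr.2p.test(h = effect_size, sig.level = alpha, power = power,
                       alternative = "two.sided")
cat(paste("(alternative) two.sided : n =", round(result2$n)), "\n")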


In [ ]:
n_observations_control <- 14500
n_observations_treatment <- 14550

conversions_control <- 582
conversions_treatment <- 675

n1 <- n_observations_treatment
n2 <- n_observations_control
In [ ]:
print(p1_label) # Set above in test design
print(p2_label)
[1] "Treatment"
[1] "Control"


In [ ]:
conv_rate_control = (conversions_control / n_observations_control)
conv_rate_treatment = (conversions_treatment / n_observations_treatment)

p1=conv_rate_treatment # Assign p1 vs. p2, test alternative references p1
p2=conv_rate_control

c1=conversions_treatment
c2=conversions_control

n1=n_observations_treatment
n2=n_observations_control
In [ ]:
print(glue("Control Conversion Rate: {round(conv_rate_control * 100, 2)}%"))
print(glue("Treatment Conversion Rate: {round(conv_rate_treatment * 100, 2)}%"))
Control Conversion Rate: 4.01%
Treatment Conversion Rate: 4.64%
In [ ]:
result_hypothesis <- switch(alternative,
  greater = sprintf("%s (%.4f) is greater than %s (%.4f)", p1_label, p1, p2_label, p2),
  less = sprintf("%s (%.4f) is less than %s (%.4f)", p1_label, p1, p2_label, p2),
  two.sided = sprintf("%s (%.4f) is different from %s (%.4f)", p1_label, p1, p2_label, p2)
)

cat("Result Hypothesis:",result_hypothesis)
Result Hypothesis: Treatment (0.0464) is greater than Control (0.0401)


In [ ]:
# Uplift
uplift = (p1 - p2) / p2

# Absolute Difference
abs_diff = abs(p1 - p2)

# Cohen's h (same arcsine-difference formula as pwr::ES.h)
proportion_effectsize <- function(p1, p2) {
  2 * asin(sqrt(p1)) - 2 * asin(sqrt(p2))
}

h <- proportion_effectsize(p1, p2)

# Interpret effect size
interpret_h <- function(h) {
  if (abs(h) < 0.2) return("negligible")
  if (abs(h) < 0.5) return("small")
  if (abs(h) < 0.8) return("medium")
  return("large")
}
cat(sprintf("Absolute difference: %.3f (%.1f%%)\n", abs_diff, abs_diff * 100))
print(glue("Uplift: {round(uplift * 100, 2)}%"))
cat(sprintf("Cohen's h: %.3f\n", h))
cat(sprintf("Effect size interpretation: %s\n", interpret_h(h)))
Absolute difference: 0.006 (0.6%)
Uplift: 15.58%
Cohen's h: 0.031
Effect size interpretation: negligible


Run test:¶

prop.test is a common way to run a two-proportion z-test in R. By default, it performs a chi-squared test with Yates continuity correction, but when you set correct = FALSE, it becomes mathematically equivalent to the two-proportion z-test:

In [ ]:
# Vectorize successes and totals for statistical test
x <- c(c1, c2)  # successes
n <- c(n1, n2)  # totals

# Run two-proportion test
# Continuity correction not needed at this sample size
test_result <- prop.test(x = x, n = n, alternative = alternative, correct = FALSE)

print(test_result)
	2-sample test for equality of proportions without continuity correction

data:  x out of n
X-squared = 6.8612, df = 1, p-value = 0.004404
alternative hypothesis: greater
95 percent confidence interval:
 0.002327633 1.000000000
sample estimates:
    prop 1     prop 2 
0.04639175 0.04013793 

Confirming with a manual version:¶

In [ ]:
p_pool <- (c1 + c2) / (n1 + n2)

se_pool <- sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))

z_stat <- (p1 - p2) / se_pool

# One-tailed test (p1 > p2): the p-value is the upper-tail probability
  # 1 - (cumulative probability up to the test statistic z under the standard normal: P(Z <= z_stat))
  # If the z statistic is large and positive, the p-value will be small
p_value_one_tailed <- 1 - pnorm(z_stat)

cat("Manual One-tailed Z test:\n")
cat("Z =", z_stat, "\n")
cat("P-value =", p_value_one_tailed, "\n")
Manual One-tailed Z test:
Z = 2.619381 
P-value = 0.004404473 
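
As one more cross-check (an aside): without continuity correction, the X-squared statistic reported by prop.test is the square of this z statistic.

In [ ]:
# Cross-check: sqrt of prop.test's X-squared should equal the manual z
# (sqrt(6.8612) ≈ 2.6194)
sqrt(unname(test_result$statistic))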


In [ ]:
# Extract p-value from result
p_value <- test_result$p.value

print(sprintf("p-value: %.4f", p_value))
[1] "p-value: 0.0044"

Because the p-value (0.0044) is less than alpha (0.050), this result is statistically significant at the 95% confidence level.


Confidence interval for difference in proportions (unpooled):¶

In [ ]:
# Difference in proportions
diff <- p1 - p2

# Critical z value for a one-sided 95% confidence interval
z <- qnorm(0.95)  # 95% quantile for one-sided CI

# Standard error of difference (unpooled)
se_diff <- sqrt((p1 * (1 - p1) / n1) + (p2 * (1 - p2) / n2))

# Margin of error
moe <- z * se_diff

# One-tailed confidence interval (lower bound only, since testing p1 > p2)
lower <- diff - moe
upper <- Inf  # Upper bound unbounded (infinity) in one-tailed CI for p1 > p2

cat("One-tailed 95% Confidence Interval (unpooled):", lower, "to", upper, "\n")
One-tailed 95% Confidence Interval (unpooled): 0.002327633 to Inf 

We are 95% confident that the true difference in conversion rates (p₁ - p₂) is at least 0.23 percentage points. Because the interval does not include 0, this result is statistically significant at the 95% confidence level.

Confidence interval for each conversion rate:¶

In [ ]:
# Calculate confidence interval for conversion rate:
se_p1 <- sqrt(p1 * (1 - p1) / n1)
lower_ci_p1 <- p1 - 1.96 * se_p1
upper_ci_p1 <- p1 + 1.96 * se_p1

cat(sprintf("%s 95%% CI: %.4f to %.4f\n",p1_label, lower_ci_p1, upper_ci_p1))

se_p2 <- sqrt(p2 * (1 - p2) / n2)
lower_ci_p2 <- p2 - 1.96 * se_p2
upper_ci_p2 <- p2 + 1.96 * se_p2

cat(sprintf("%s 95%% CI: %.4f to %.4f\n",p2_label, lower_ci_p2, upper_ci_p2))
Treatment 95% CI: 0.0430 to 0.0498
Control 95% CI: 0.0369 to 0.0433

If you repeated your experiment or data collection many times under the same conditions, then 95% of those calculated confidence intervals would contain the true population conversion rate.
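
To illustrate that statement, here's a small simulation sketch (an aside; the 4% true rate, per-sample size, and seed are hypothetical): draw many samples, build a Wald 95% CI from each, and count how often the interval covers the truth.

In [ ]:
# Illustrative simulation: empirical coverage of the Wald 95% CI
set.seed(42)    # hypothetical seed for reproducibility
true_p <- 0.04  # hypothetical true conversion rate
n_obs <- 14500  # per-sample size (mirrors the control group)
covered <- replicate(10000, {
  p_hat <- rbinom(1, n_obs, true_p) / n_obs
  se <- sqrt(p_hat * (1 - p_hat) / n_obs)
  (true_p >= p_hat - 1.96 * se) && (true_p <= p_hat + 1.96 * se)
})
cat(sprintf("Empirical coverage: %.1f%% (should be close to 95%%)\n",
            mean(covered) * 100))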


In [ ]:
# Effective sample size (harmonic mean for unequal n)
n_effective <- (2 * n1 * n2) / (n1 + n2)

# Calculate power
power_result <- pwr.2p.test(h = h, n = n_effective, sig.level = alpha, alternative = alternative)
print(power_result)
     Difference of proportion power calculation for binomial distribution (arcsine transformation) 

              h = 0.03075801
              n = 14524.96
      sig.level = 0.05
          power = 0.8355543
    alternative = greater

NOTE: same sample sizes

In [ ]:
# Extract the power
power_pct <- round(power_result$power * 100, 1)

cat("Result Power:", power_pct, "%\n\n")
Result Power: 83.6 %

Our test was adequately powered (~83% actual vs. 80% desired), meaning we had a strong chance of detecting a true difference if one existed.

Results Summary¶

In [ ]:
print(glue("Control Conversion Rate: {round(conv_rate_control * 100, 2)}%"))
print(glue("Treatment Conversion Rate: {round(conv_rate_treatment * 100, 2)}%"))
cat("\n")
print(paste("Result Hypothesis:",result_hypothesis))
cat("\n")
cat(sprintf("Absolute difference: %.3f (%.1f%%)\n", abs_diff, abs_diff * 100))
print(glue("Uplift: {round(uplift * 100, 2)}%"))
cat(sprintf("Cohen's h: %.3f\n", h))
cat(sprintf("Effect size interpretation: %s\n", interpret_h(h)))
print(test_result)
print(sprintf("p-value: %.4f", p_value))
cat("\n")
cat(sprintf("%s 95%% CI: %.4f to %.4f\n",p1_label, lower_ci_p1, upper_ci_p1))
cat(sprintf("%s 95%% CI: %.4f to %.4f\n",p2_label, lower_ci_p2, upper_ci_p2))
cat("\n")
cat("One-tailed 95% Confidence Interval for Diff (unpooled):", lower, "to", upper, "\n")
cat("\n")
cat("Result Power:", power_pct, "%\n\n")
Control Conversion Rate: 4.01%
Treatment Conversion Rate: 4.64%

[1] "Result Hypothesis: Treatment (0.0464) is greater than Control (0.0401)"

Absolute difference: 0.006 (0.6%)
Uplift: 15.58%
Cohen's h: 0.031
Effect size interpretation: negligible

	2-sample test for equality of proportions without continuity correction

data:  x out of n
X-squared = 6.8612, df = 1, p-value = 0.004404
alternative hypothesis: greater
95 percent confidence interval:
 0.002327633 1.000000000
sample estimates:
    prop 1     prop 2 
0.04639175 0.04013793 

[1] "p-value: 0.0044"

Treatment 95% CI: 0.0430 to 0.0498
Control 95% CI: 0.0369 to 0.0433

One-tailed 95% Confidence Interval for Diff (unpooled): 0.002327633 to Inf 

Result Power: 83.6 %

Performance: With 95% confidence, the treatment has a higher conversion rate than the control by at least 0.23 percentage points (based on the CI lower bound of 0.0023). This supports the hypothesis that the treatment is better than the control.

Significance: Because the p-value (0.0044) is less than alpha (0.050), and the 95% confidence interval for the difference does not contain 0, this result is statistically significant at the 95% confidence level. The practical significance is low (Cohen's h = 0.03), but the business impact is meaningful: the treatment should significantly improve the booking rate, reduce user confusion, and generate additional revenue for studio customers.

Power: Our test was adequately powered (~83% actual vs. 80% desired), meaning we had a strong chance of detecting a true difference if one existed.

Recommendation¶

Due to the significance, power, and meaningful business impact, I recommend moving forward with implementing the treatment.