install.packages("tableone")
install.packages("Matching")
install.packages("MatchIt")
install.packages("sysfonts")
install.packages("showtext")
install.packages("ggplot2")
install.packages("optmatch")
install.packages("sensitivitymv")

library(tableone)
library(Matching)
library(MatchIt)
library(sysfonts)
library(showtext)
library(ggplot2)
library(dplyr)
library(optmatch)
library(sensitivitymv)

mydata <- read.csv("https://raw.githubusercontent.com/lindsayalexandra14/ds_portfolio/main/1_projects/statistical_analysis/matching_propensity_scores/remote_work.csv",stringsAsFactors = T)

dim(mydata)

head(mydata)

tail(mydata)

mydata <- mydata[mydata$Region == "North America", ]

mydata <- mydata[,!names(mydata)=="Region"]

mydata <- mydata[order(mydata$Work_Location), ]

mydata$Job_Role[1:30] <- "Marketing"
mydata$Physical_Activity[1:60] <- "Weekly"

mydata$Job_Role[31:60] <- "Software Engineer"
mydata$Access_to_Mental_Health_Resources[1:100] <- "Yes"

dim(mydata)

anyNA(mydata)

sapply(mydata, class)

mydata <- mydata[ , !names(mydata) %in%
                   c("Employee_ID","Productivity_Change","Satisfaction_with_Remote_Work",
                     "Company_Support_for_Remote_Work","Industry","Stress_Level","Mental_Health_Condition",
                     "Social_Isolation_Rating")]

unique(mydata["Job_Role"])

unique(mydata["Work_Location"])

unique(mydata["Access_to_Mental_Health_Resources"])

unique(mydata["Physical_Activity"])

unique(mydata["Gender"])

unique(mydata["Sleep_Quality"])

print(sapply(mydata, class))

                              Age                            Gender 
                        "integer"                          "factor" 
                         Job_Role               Years_of_Experience 
                         "factor"                         "integer" 
                    Work_Location             Hours_Worked_Per_Week 
                         "factor"                         "integer" 
       Number_of_Virtual_Meetings          Work_Life_Balance_Rating 
                        "integer"                         "integer" 
Access_to_Mental_Health_Resources                 Physical_Activity 
                         "factor"                          "factor" 
                    Sleep_Quality 
                         "factor"

mydata$mh_access<-as.numeric(mydata$Access_to_Mental_Health_Resources=="Yes")

mydata$software_engineer_role<-as.numeric(mydata$Job_Role=="Software Engineer")
mydata$sales_role<-as.numeric(mydata$Job_Role=="Sales")
mydata$data_scientist_role<-as.numeric(mydata$Job_Role=="Data Scientist")
mydata$hr_role<-as.numeric(mydata$Job_Role=="HR")
mydata$designer_role<-as.numeric(mydata$Job_Role=="Designer")
mydata$product_manager_role<-as.numeric(mydata$Job_Role=="Product Manager")
mydata$marketing_role<-as.numeric(mydata$Job_Role=="Marketing")

mydata$treat<- as.numeric(mydata$Work_Location=="Remote")

mydata$age<-mydata$Age
mydata$work_life_balance_rating<-mydata$Work_Life_Balance_Rating

mydata$female<-as.numeric(mydata$Gender=="Female")
mydata$male<-as.numeric(mydata$Gender=="Male")
mydata$nonbinary<-as.numeric(mydata$Gender=="Non-binary")
mydata$gender_prefernotsaid<-as.numeric(mydata$Gender=="Prefer not to say")

mydata$weekly_exercise<-as.numeric(mydata$Physical_Activity=="Weekly")
mydata$daily_exercise<-as.numeric(mydata$Physical_Activity=="Daily")
mydata$no_exercise<-as.numeric(mydata$Physical_Activity=="None")

mydata$good_sleep<-as.numeric(mydata$Sleep_Quality=="Good")
mydata$average_sleep<-as.numeric(mydata$Sleep_Quality=="Average")
mydata$poor_sleep<-as.numeric(mydata$Sleep_Quality=="Poor")

mydata$num_virt_mtngs<-mydata$Number_of_Virtual_Meetings
mydata$weekly_hrs_worked<-mydata$Hours_Worked_Per_Week
mydata$yrs_exp<-mydata$Years_of_Experience

xvars=c("age","mh_access","software_engineer_role","sales_role","data_scientist_role",
        "hr_role","designer_role","product_manager_role","marketing_role","female","male","nonbinary",
        "gender_prefernotsaid","weekly_exercise","daily_exercise","no_exercise",
        "good_sleep","average_sleep","poor_sleep","num_virt_mtngs","weekly_hrs_worked",
        "yrs_exp")

summary(mydata)

      Age                      Gender                 Job_Role  
 Min.   :22.00   Female           :201   Data Scientist   : 95  
 1st Qu.:30.00   Male             :199   Designer         :100  
 Median :41.00   Non-binary       :192   HR               :117  
 Mean   :40.79   Prefer not to say:185   Marketing        :130  
 3rd Qu.:50.00                           Project Manager  :112  
 Max.   :60.00                           Sales            : 96  
                                         Software Engineer:127  
 Years_of_Experience Work_Location Hours_Worked_Per_Week
 Min.   : 1.00       Hybrid:261    Min.   :20.00        
 1st Qu.: 9.00       Onsite:252    1st Qu.:29.00        
 Median :18.00       Remote:264    Median :40.00        
 Mean   :17.93                     Mean   :39.74        
 3rd Qu.:27.00                     3rd Qu.:50.00        
 Max.   :35.00                     Max.   :60.00        
                                                        
 Number_of_Virtual_Meetings Work_Life_Balance_Rating
 Min.   : 0.000             Min.   :1.00            
 1st Qu.: 4.000             1st Qu.:2.00            
 Median : 8.000             Median :3.00            
 Mean   : 7.592             Mean   :2.97            
 3rd Qu.:12.000             3rd Qu.:4.00            
 Max.   :15.000             Max.   :5.00            
                                                    
 Access_to_Mental_Health_Resources Physical_Activity Sleep_Quality
 No :352                           Daily :239        Average:260  
 Yes:425                           None  :238        Good   :253  
                                   Weekly:300        Poor   :264  
                                                                  
                                                                  
                                                                  
                                                                  
   mh_access     software_engineer_role   sales_role     data_scientist_role
 Min.   :0.000   Min.   :0.0000         Min.   :0.0000   Min.   :0.0000     
 1st Qu.:0.000   1st Qu.:0.0000         1st Qu.:0.0000   1st Qu.:0.0000     
 Median :1.000   Median :0.0000         Median :0.0000   Median :0.0000     
 Mean   :0.547   Mean   :0.1634         Mean   :0.1236   Mean   :0.1223     
 3rd Qu.:1.000   3rd Qu.:0.0000         3rd Qu.:0.0000   3rd Qu.:0.0000     
 Max.   :1.000   Max.   :1.0000         Max.   :1.0000   Max.   :1.0000     
                                                                            
    hr_role       designer_role    product_manager_role marketing_role  
 Min.   :0.0000   Min.   :0.0000   Min.   :0            Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0            1st Qu.:0.0000  
 Median :0.0000   Median :0.0000   Median :0            Median :0.0000  
 Mean   :0.1506   Mean   :0.1287   Mean   :0            Mean   :0.1673  
 3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0            3rd Qu.:0.0000  
 Max.   :1.0000   Max.   :1.0000   Max.   :0            Max.   :1.0000  
                                                                        
     treat             age        work_life_balance_rating     female      
 Min.   :0.0000   Min.   :22.00   Min.   :1.00             Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:30.00   1st Qu.:2.00             1st Qu.:0.0000  
 Median :0.0000   Median :41.00   Median :3.00             Median :0.0000  
 Mean   :0.3398   Mean   :40.79   Mean   :2.97             Mean   :0.2587  
 3rd Qu.:1.0000   3rd Qu.:50.00   3rd Qu.:4.00             3rd Qu.:1.0000  
 Max.   :1.0000   Max.   :60.00   Max.   :5.00             Max.   :1.0000  
                                                                           
      male          nonbinary      gender_prefernotsaid weekly_exercise 
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000       Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000       1st Qu.:0.0000  
 Median :0.0000   Median :0.0000   Median :0.0000       Median :0.0000  
 Mean   :0.2561   Mean   :0.2471   Mean   :0.2381       Mean   :0.3861  
 3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000       3rd Qu.:1.0000  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000       Max.   :1.0000  
                                                                        
 daily_exercise    no_exercise       good_sleep     average_sleep   
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
 Mean   :0.3076   Mean   :0.3063   Mean   :0.3256   Mean   :0.3346  
 3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
                                                                    
   poor_sleep     num_virt_mtngs   weekly_hrs_worked    yrs_exp     
 Min.   :0.0000   Min.   : 0.000   Min.   :20.00     Min.   : 1.00  
 1st Qu.:0.0000   1st Qu.: 4.000   1st Qu.:29.00     1st Qu.: 9.00  
 Median :0.0000   Median : 8.000   Median :40.00     Median :18.00  
 Mean   :0.3398   Mean   : 7.592   Mean   :39.74     Mean   :17.93  
 3rd Qu.:1.0000   3rd Qu.:12.000   3rd Qu.:50.00     3rd Qu.:27.00  
 Max.   :1.0000   Max.   :15.000   Max.   :60.00     Max.   :35.00

table1<-CreateTableOne(vars=xvars,strata="treat",data=mydata, test=FALSE)
print(table1,smd=TRUE)

                                    Stratified by treat
                                     0             1             SMD   
  n                                    513           264               
  age (mean (SD))                    40.66 (11.42) 41.06 (11.39)  0.035
  mh_access (mean (SD))               0.58 (0.49)   0.48 (0.50)   0.201
  software_engineer_role (mean (SD))  0.19 (0.39)   0.12 (0.33)   0.178
  sales_role (mean (SD))              0.12 (0.33)   0.13 (0.34)   0.024
  data_scientist_role (mean (SD))     0.11 (0.31)   0.15 (0.36)   0.115
  hr_role (mean (SD))                 0.15 (0.36)   0.16 (0.36)   0.020
  designer_role (mean (SD))           0.11 (0.31)   0.17 (0.37)   0.167
  product_manager_role (mean (SD))    0.00 (0.00)   0.00 (0.00)  <0.001
  marketing_role (mean (SD))          0.19 (0.39)   0.12 (0.33)   0.193
  female (mean (SD))                  0.26 (0.44)   0.27 (0.44)   0.022
  male (mean (SD))                    0.26 (0.44)   0.25 (0.44)   0.008
  nonbinary (mean (SD))               0.26 (0.44)   0.23 (0.42)   0.057
  gender_prefernotsaid (mean (SD))    0.23 (0.42)   0.25 (0.43)   0.042
  weekly_exercise (mean (SD))         0.44 (0.50)   0.29 (0.45)   0.313
  daily_exercise (mean (SD))          0.27 (0.44)   0.39 (0.49)   0.256
  no_exercise (mean (SD))             0.30 (0.46)   0.33 (0.47)   0.064
  good_sleep (mean (SD))              0.32 (0.47)   0.34 (0.47)   0.037
  average_sleep (mean (SD))           0.35 (0.48)   0.31 (0.47)   0.065
  poor_sleep (mean (SD))              0.34 (0.47)   0.35 (0.48)   0.028
  num_virt_mtngs (mean (SD))          7.34 (4.64)   8.08 (4.66)   0.160
  weekly_hrs_worked (mean (SD))      39.66 (11.96) 39.89 (11.56)  0.019
  yrs_exp (mean (SD))                17.98 (10.16) 17.82 (10.02)  0.016

y_trt<-mydata$work_life_balance_rating[mydata$treat==1]
print(head(y_trt))

[1] 3 2 2 3 2 2

y_con<-mydata$work_life_balance_rating[mydata$treat==0]
print(head(y_con))

[1] 5 1 1 2 3 3

mean(y_trt)-mean(y_con)

print(xvars)

 [1] "age"                    "mh_access"              "software_engineer_role"
 [4] "sales_role"             "data_scientist_role"    "hr_role"               
 [7] "designer_role"          "product_manager_role"   "marketing_role"        
[10] "female"                 "male"                   "nonbinary"             
[13] "gender_prefernotsaid"   "weekly_exercise"        "daily_exercise"        
[16] "no_exercise"            "good_sleep"             "average_sleep"         
[19] "poor_sleep"             "num_virt_mtngs"         "weekly_hrs_worked"     
[22] "yrs_exp"

psmodel<-glm(treat~age + mh_access + software_engineer_role + sales_role + data_scientist_role +
        hr_role + designer_role + marketing_role + female + male +
        nonbinary + weekly_exercise + daily_exercise + weekly_hrs_worked +
        yrs_exp + num_virt_mtngs + good_sleep + average_sleep, family=binomial(), data=mydata)
summary(psmodel)

Call:
glm(formula = treat ~ age + mh_access + software_engineer_role + 
    sales_role + data_scientist_role + hr_role + designer_role + 
    marketing_role + female + male + nonbinary + weekly_exercise + 
    daily_exercise + weekly_hrs_worked + yrs_exp + num_virt_mtngs + 
    good_sleep + average_sleep, family = binomial(), data = mydata)

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)  
(Intercept)            -0.776696   0.525545  -1.478   0.1394  
age                     0.005003   0.006925   0.722   0.4700  
mh_access              -0.269276   0.159144  -1.692   0.0906 .
software_engineer_role -0.323999   0.295212  -1.098   0.2724  
sales_role             -0.029330   0.297183  -0.099   0.9214  
data_scientist_role     0.288404   0.293466   0.983   0.3257  
hr_role                 0.011547   0.282328   0.041   0.9674  
designer_role           0.430188   0.288552   1.491   0.1360  
marketing_role         -0.465510   0.290572  -1.602   0.1091  
female                 -0.059218   0.221302  -0.268   0.7890  
male                   -0.113664   0.222226  -0.511   0.6090  
nonbinary              -0.256986   0.227221  -1.131   0.2581  
weekly_exercise        -0.426727   0.197987  -2.155   0.0311 *
daily_exercise          0.316435   0.195027   1.623   0.1047  
weekly_hrs_worked       0.002655   0.006660   0.399   0.6902  
yrs_exp                -0.005850   0.007821  -0.748   0.4545  
num_virt_mtngs          0.040373   0.016991   2.376   0.0175 *
good_sleep             -0.050105   0.192017  -0.261   0.7941  
average_sleep          -0.185746   0.193171  -0.962   0.3363  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 995.93  on 776  degrees of freedom
Residual deviance: 951.75  on 758  degrees of freedom
AIC: 989.75

Number of Fisher Scoring iterations: 4

pscore<-psmodel$fitted.values

min(round(pscore,4))
max(round(pscore,4))
length(pscore)

print(coef(psmodel))

           (Intercept)                    age              mh_access 
          -0.776695947            0.005002667           -0.269276076 
software_engineer_role             sales_role    data_scientist_role 
          -0.323998785           -0.029329713            0.288404195 
               hr_role          designer_role         marketing_role 
           0.011547314            0.430188105           -0.465509825 
                female                   male              nonbinary 
          -0.059217956           -0.113663675           -0.256985968 
       weekly_exercise         daily_exercise      weekly_hrs_worked 
          -0.426727007            0.316435169            0.002654734 
               yrs_exp         num_virt_mtngs             good_sleep 
          -0.005850215            0.040372542           -0.050104998 
         average_sleep 
          -0.185746308

mydata <- cbind(mydata, pscore)

df <- data.frame(ps = pscore, treat = factor(mydata$treat))

font_add_google("Abel", "abel")
showtext_auto()

suppressWarnings(ggplot(df, aes(x = ps, fill = factor(treat))) +
  geom_histogram(
    data = subset(df, treat == 1),
    bins = 20, alpha = 0.6
  ) +
  geom_histogram(
    data = subset(df, treat == 0),
    bins = 20, alpha = 0.6,
    aes(y = -after_stat(count)) # flip control group to negative y
  ) +
  labs(title = "Mirrored Propensity Score Distribution",
       x = "Propensity Score", y = "Count") +
  scale_fill_manual(values = c("gray", "mediumorchid"),
                    labels = c("Control", "Treated")) +
  theme_minimal()+
  theme(panel.grid.major = element_blank(),
      panel.grid.minor = element_blank())  +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20, face = "bold", family = "abel"),
    axis.title.x = element_text(size = 14, face = "bold", family = "abel"),
    axis.title.y = element_text(size = 14, face = "bold", family = "abel"),
    legend.position = "top",
    legend.title = element_text(size = 14, face = "bold", family = "abel"),
    legend.text = element_text(size = 12, family = "abel"),
    axis.text.x = element_text(size = 12, family = "abel"),
    axis.text.y = element_text(size = 12, family = "abel")
))

set.seed(100)

psmatch<-Match(Tr=mydata$treat,M=1,X=pscore,replace=FALSE)
matched<-mydata[unlist(psmatch[c("index.treated","index.control")]),]
matchedtab<-CreateTableOne(vars=xvars,strata="treat",data=matched,test=FALSE)
print(matchedtab,smd=TRUE)

                                    Stratified by treat
                                     0             1             SMD   
  n                                    264           264               
  age (mean (SD))                    40.93 (11.23) 41.06 (11.39)  0.011
  mh_access (mean (SD))               0.48 (0.50)   0.48 (0.50)  <0.001
  software_engineer_role (mean (SD))  0.09 (0.29)   0.12 (0.33)   0.098
  sales_role (mean (SD))              0.14 (0.35)   0.13 (0.34)   0.044
  data_scientist_role (mean (SD))     0.16 (0.36)   0.15 (0.36)   0.021
  hr_role (mean (SD))                 0.16 (0.36)   0.16 (0.36)  <0.001
  designer_role (mean (SD))           0.18 (0.39)   0.17 (0.37)   0.040
  product_manager_role (mean (SD))    0.00 (0.00)   0.00 (0.00)  <0.001
  marketing_role (mean (SD))          0.11 (0.32)   0.12 (0.33)   0.023
  female (mean (SD))                  0.28 (0.45)   0.27 (0.44)   0.026
  male (mean (SD))                    0.28 (0.45)   0.25 (0.44)   0.060
  nonbinary (mean (SD))               0.24 (0.43)   0.23 (0.42)   0.027
  gender_prefernotsaid (mean (SD))    0.20 (0.40)   0.25 (0.43)   0.118
  weekly_exercise (mean (SD))         0.31 (0.47)   0.29 (0.45)   0.058
  daily_exercise (mean (SD))          0.36 (0.48)   0.39 (0.49)   0.047
  no_exercise (mean (SD))             0.32 (0.47)   0.33 (0.47)   0.008
  good_sleep (mean (SD))              0.34 (0.48)   0.34 (0.47)   0.016
  average_sleep (mean (SD))           0.33 (0.47)   0.31 (0.47)   0.024
  poor_sleep (mean (SD))              0.33 (0.47)   0.35 (0.48)   0.040
  num_virt_mtngs (mean (SD))          8.08 (4.68)   8.08 (4.66)   0.001
  weekly_hrs_worked (mean (SD))      40.02 (11.83) 39.89 (11.56)  0.011
  yrs_exp (mean (SD))                17.88 (10.16) 17.82 (10.02)  0.006

# Group treated vs. control
treated  <- mydata[psmatch$index.treated, ]
control  <- mydata[psmatch$index.control, ]

# Add ID for each treated/control pair
# Indices were already ordered in matching order for treatment & control in matching function
pair_id <- seq_along(psmatch$index.treated)
treated$pair_id <- pair_id
control$pair_id <- pair_id

# Add propensity score
treated$pscore <- pscore[psmatch$index.treated]
control$pscore <- pscore[psmatch$index.control]

matched_pairs <- merge(
  treated[, c("pair_id", "treat", "pscore")],
  control[, c("pair_id", "treat", "pscore")],
  by = "pair_id",
  suffixes = c(".treated", ".control")
)

head(matched_pairs)

matched_pairs_w_outcome <- merge(
  treated[, c("pair_id", "treat", "pscore", "work_life_balance_rating")],
  control[, c("pair_id", "treat", "pscore", "work_life_balance_rating")],
  by = "pair_id",
  suffixes = c(".treated", ".control")
)

head(matched_pairs_w_outcome)

treat_vec <- rep(1, length(psmatch$index.treated))
control_vec <- rep(0, length(psmatch$index.control))

df_matched <- data.frame(
  pair_id,
  treat   = c(treat_vec, control_vec),
  pscore  = c(pscore[psmatch$index.treated], pscore[psmatch$index.control])
)

head(df_matched)

ggplot(df_matched, aes(x = pscore, fill = factor(treat))) +
  geom_histogram(
    data = subset(df_matched, treat == 1),
    bins = 20, alpha = 0.6
  ) +
  geom_histogram(
    data = subset(df_matched, treat == 0),
    bins = 20, alpha = 0.6,
    aes(y = after_stat(-count))   # flip control group to negative y
  ) +
  labs(title = "Mirrored Propensity Score Distribution",
       x = "Propensity Score", y = "Count") +
  scale_fill_manual(values = c("gray", "mediumorchid"),
                    labels = c("Control", "Treated")) +
  theme_minimal()+
  theme(panel.grid.major = element_blank(),
      panel.grid.minor = element_blank())  +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20, face = "bold", family = "abel"),
    axis.title.x = element_text(size = 14, face = "bold", family = "abel"),
    axis.title.y = element_text(size = 14, face = "bold", family = "abel"),
    legend.position = "top",
    legend.title = element_text(size = 14, face = "bold", family = "abel"),
    legend.text = element_text(size = 12, family = "abel"),
    axis.text.x = element_text(size = 12, family = "abel"),
    axis.text.y = element_text(size = 12, family = "abel")
)

matched$treat<-as.numeric(matched$treat==1)

y_trt_matched<-matched$work_life_balance_rating[matched$treat==1]
head(y_trt_matched)

y_con_matched<-matched$work_life_balance_rating[matched$treat==0]
head(y_con_matched)

length(y_con_matched)

mean(y_trt_matched)

mean(y_con_matched)

mean(y_trt_matched)-mean(y_con_matched)

diffy<-y_trt_matched-y_con_matched

print(head(diffy))

[1]  1  0  1 -2 -1 -2

t.test(diffy)

	One Sample t-test

data:  diffy
t = -1.2518, df = 263, p-value = 0.2117
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
 -0.39958173  0.08897566
sample estimates:
mean of x 
-0.155303

# matched_pairs_w_outcome = dataframe of matched pairs from before
# y = matrix with treated and control outcomes per row

y <- cbind(
  matched_pairs_w_outcome$work_life_balance_rating.treated,
  matched_pairs_w_outcome$work_life_balance_rating.control
)

head(y)

library(sensitivitymv)

senmv(y, gamma = 1.5)

# for (g in seq(1, 3, 0.1)) {
#   cat("Gamma =", g, "\n")
#   print(senmv(y, gamma = g))
# }

# Per-pair differences
matched_pairs_w_outcome$diff <- matched_pairs_w_outcome$work_life_balance_rating.treated - matched_pairs_w_outcome$work_life_balance_rating.control

# Observed mean difference
obs_diff <- mean(matched_pairs_w_outcome$diff)
cat("Observed mean difference (treated - control):", obs_diff, "\n")

# Permutation test
set.seed(100)
n_iter <- 1000
perm_diffs <- numeric(n_iter)

for (i in 1:n_iter) {
  # Randomly flip the sign of the pair differences (simulates swapping treatment labels)
  flip <- sample(c(-1, 1), nrow(matched_pairs_w_outcome), replace = TRUE)
  perm_diffs[i] <- mean(matched_pairs_w_outcome$diff * flip)
}

# Observed vs. permutation distribution
p_value <- mean(abs(perm_diffs) >= abs(obs_diff))
cat("Permutation p-value:", p_value, "\n")

Observed mean difference (treated - control): -0.155303 
Permutation p-value: 0.209

perm_df <- data.frame(perm_diffs = perm_diffs)

suppressWarnings(ggplot(perm_df, aes(x = perm_diffs)) +
  geom_histogram(bins = 30, fill = "lightgray", color = "white") +
  geom_vline(xintercept = obs_diff, color = "mediumorchid", size = 2) +
  annotate("text", x = obs_diff,
           y = max(hist(perm_diffs, plot = FALSE)$counts),
           label = paste0("Observed =", round(obs_diff, 3)),
           hjust = -.1, color = "mediumorchid", size =6) +
  labs(
    title = "Permutation Distribution of Treatment Effect",
    x = "Difference in mean outcome (treated - control)",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 20, face = "bold", family = "abel"),
    axis.title.x = element_text(size = 14, face = "bold", family = "abel"),
    axis.title.y = element_text(size = 14, face = "bold", family = "abel"),
    axis.text.x = element_text(size = 12, family = "abel"),
    axis.text.y = element_text(size = 12, family = "abel")
  ))

psm_optimal_optmatch <- function(data, treatment, xvars, outcome) {
  set.seed(100)

  # Distance matrix
  dist_mat <- match_on(as.formula(paste(treatment, "~ pscore")), data = data)

  # Optimal pair matching
  matches <- pairmatch(dist_mat, data = data)

  # Pair IDs as integers
  data$pair_id <- as.numeric(as.factor(matches))

  # Drop umatched units
  matched <- subset(data, !is.na(pair_id))

  # Numeric treatment indicator
  matched$treat_num <- as.numeric(matched[[treatment]] == 1)

  # SMD table (including max SMD)
  tab <- CreateTableOne(vars = xvars, strata = treatment, data = matched, test = FALSE)
  smd_vals <- ExtractSmd(tab)
  smd_vals[!is.finite(smd_vals)] <- NA
  max_smd <- if(length(smd_vals) == 0) NA else max(abs(smd_vals), na.rm = TRUE)

  # Matched pairs table
  treated <- matched[matched$treat_num == 1, ]
  control <- matched[matched$treat_num == 0, ]

  merged <- merge(
    treated[, c("pair_id", treatment, outcome, "pscore")],
    control[, c("pair_id", treatment, outcome, "pscore")],
    by = "pair_id",
    suffixes = c(".treated", ".control")
  )

  # Paired differences
  diffy <- merged[[paste0(outcome, ".treated")]] - merged[[paste0(outcome, ".control")]]

  treat_effect <- mean(diffy)
  p_value <- t.test(diffy)$p.value
  merged$observed_diff <- diffy

  # Output
  return(list(
    summary = data.frame(
      n_matched_pairs = nrow(merged),
      treat_effect = round(treat_effect, 3),
      p_value = p_value,
      max_smd = max_smd
    ),
    matched_pairs_details = merged,
    matched_long = matched[, c("pair_id", treatment, "pscore", outcome, xvars)]
  ))
}

result <- psm_optimal_optmatch(
  data = mydata,
  treatment = "treat",
  xvars = xvars,
  outcome = "work_life_balance_rating"
)

result$summary
head(result$matched_pairs_details)

	pair_id	treat.treated	pscore.treated	treat.control	pscore.control
	<int>	<dbl>	<dbl>	<dbl>	<dbl>
1	1	1	0.5510691	0	0.5501304
2	2	1	0.2419249	0	0.2414837
3	3	1	0.4735577	0	0.4743751
4	4	1	0.3018724	0	0.3017638
5	5	1	0.3577197	0	0.3589932
6	6	1	0.4750771	0	0.4743643

	pair_id	treat.treated	pscore.treated	work_life_balance_rating.treated	treat.control	pscore.control	work_life_balance_rating.control
	<int>	<dbl>	<dbl>	<int>	<dbl>	<dbl>	<int>
1	1	1	0.5510691	3	0	0.5501304	2
2	2	1	0.2419249	2	0	0.2414837	2
3	3	1	0.4735577	2	0	0.4743751	1
4	4	1	0.3018724	3	0	0.3017638	5
5	5	1	0.3577197	2	0	0.3589932	3
6	6	1	0.4750771	2	0	0.4743643	4

n_matched_pairs	treat_effect	p_value	max_smd
<int>	<dbl>	<dbl>	<dbl>
264	-0.17	0.1743329	0.1084192

	pair_id	treat.treated	work_life_balance_rating.treated	pscore.treated	treat.control	work_life_balance_rating.control	pscore.control	observed_diff
	<dbl>	<dbl>	<int>	<dbl>	<dbl>	<int>	<dbl>	<int>
1	1	1	3	0.5510691	0	5	0.5549256	-2
2	2	1	1	0.5162423	0	4	0.5179893	-3
3	3	1	1	0.2730339	0	2	0.2730537	-1
4	4	1	2	0.4900825	0	4	0.4829243	-2
5	5	1	2	0.2735915	0	5	0.2737418	-3
6	6	1	3	0.2424331	0	3	0.2405656	0

Install Packages¶

Load Libraries¶

Randomization/Permutation Tests¶

	Employee_ID	Age	Gender	Job_Role	Industry	Years_of_Experience	Work_Location	Hours_Worked_Per_Week	Number_of_Virtual_Meetings	Work_Life_Balance_Rating	Stress_Level	Mental_Health_Condition	Access_to_Mental_Health_Resources	Productivity_Change	Social_Isolation_Rating	Satisfaction_with_Remote_Work	Company_Support_for_Remote_Work	Physical_Activity	Sleep_Quality	Region
	<fct>	<int>	<fct>	<fct>	<fct>	<int>	<fct>	<int>	<int>	<int>	<fct>	<fct>	<fct>	<fct>	<int>	<fct>	<int>	<fct>	<fct>	<fct>
1	EMP0001	32	Non-binary	HR	Healthcare	13	Hybrid	47	7	2	Medium	Depression	No	Decrease	1	Unsatisfied	1	Weekly	Good	Europe
2	EMP0002	40	Female	Data Scientist	IT	3	Remote	52	4	1	Medium	Anxiety	No	Increase	3	Satisfied	2	Weekly	Good	Asia
3	EMP0003	59	Non-binary	Software Engineer	Education	22	Hybrid	46	11	5	Medium	Anxiety	No	No Change	4	Unsatisfied	5	None	Poor	North America
4	EMP0004	27	Male	Software Engineer	Finance	20	Onsite	32	8	4	High	Depression	Yes	Increase	3	Unsatisfied	3	None	Poor	Europe
5	EMP0005	49	Male	Sales	Consulting	32	Onsite	35	12	2	High	None	Yes	Decrease	3	Unsatisfied	3	Weekly	Average	North America
6	EMP0006	59	Non-binary	Sales	IT	31	Hybrid	39	3	4	High	None	No	Increase	5	Unsatisfied	1	None	Average	South America

	Job_Role
	<fct>
3	Marketing
585	Software Engineer
1043	Sales
1139	Designer
1151	Project Manager
1235	Data Scientist
1410	HR