!pip install lifelines

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines import WeibullFitter, LogNormalFitter, LogLogisticFitter, ExponentialFitter, GeneralizedGammaFitter
from lifelines import WeibullAFTFitter, LogNormalAFTFitter, LogLogisticAFTFitter, GeneralizedGammaRegressionFitter
from lifelines.utils import median_survival_times
from sklearn.model_selection import KFold
from lifelines.calibration import survival_probability_calibration
from lifelines import exceptions
import contextlib
import io
import math
import warnings

custom_palette = [
    '#dfbefb',
    '#C49C94',
    '#dfd4c9',
    '#FAC287',
    '#cb95f9',
    '#B3B3F2',
    '#895b3a',
    '#8383F7',
    '#C2C8FF',
    '#e7dbf3',
    '#CACACC',
    '#C0F0EF',
    '#101827',
    '#1bc2bb'
]

sns.palplot(custom_palette)
plt.show()

!wget "https://raw.githubusercontent.com/lindsayalexandra14/ds_portfolio/main/1_projects/machine_learning/time_series_and_survival/employee_churn/employee_churn.csv"

--2025-12-12 23:56:40--  https://raw.githubusercontent.com/lindsayalexandra14/ds_portfolio/main/1_projects/machine_learning/time_series_and_survival/employee_churn/employee_churn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 775551 (757K) [text/plain]
Saving to: ‘employee_churn.csv.3’

employee_churn.csv. 100%[===================>] 757.37K  --.-KB/s    in 0.1s    

2025-12-12 23:56:40 (5.33 MB/s) - ‘employee_churn.csv.3’ saved [775551/775551]

df = pd.read_csv("employee_churn.csv")

df.shape

(14249, 10)

df.head()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14249 entries, 0 to 14248
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   avg_monthly_hrs    14249 non-null  int64  
 1   department         13540 non-null  object 
 2   filed_complaint    2058 non-null   float64
 3   last_evaluation    12717 non-null  float64
 4   n_projects         14249 non-null  int64  
 5   recently_promoted  300 non-null    float64
 6   salary             14249 non-null  object 
 7   satisfaction       14068 non-null  float64
 8   status             14249 non-null  object 
 9   tenure             14068 non-null  float64
dtypes: float64(5), int64(2), object(3)
memory usage: 1.1+ MB

variables = {
    "avg_monthly_hours": "average monthly hours worked",
    "filed_complaint": '"1" for filed a work complaint (ever)',
    "last_evaluation": 'undefined - to be removed',
    "n_projects": "number of projects completed in past year",
    "recently_promoted": '"1" for promoted in past year',
    "satisfaction": "0-100, 100 for very satisfied",
    "tenure": "years at company"
}

for variable, definition in variables.items():
    print(f"{variable}: {definition}")

avg_monthly_hours: average monthly hours worked
filed_complaint: "1" for filed a work complaint (ever)
last_evaluation: undefined - to be removed
n_projects: number of projects completed in past year
recently_promoted: "1" for promoted in past year
satisfaction: 0-100, 100 for very satisfied
tenure: years at company

missing_df = pd.DataFrame({
    'Missing Count': df.isnull().sum(),
    'Missing %': (df.isnull().sum() / len(df)) * 100
})
missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)

df["recently_promoted"].unique()

array([nan,  1.])

df["filed_complaint"].unique()

array([nan,  1.])

df["recently_promoted"] = df["recently_promoted"].fillna(0)

df["filed_complaint"] = df["filed_complaint"].fillna(0)

df = df.dropna(subset=["department", "satisfaction", "tenure"])

df = df.drop("last_evaluation", axis=1)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13359 entries, 0 to 14247
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   avg_monthly_hrs    13359 non-null  int64  
 1   department         13359 non-null  object 
 2   filed_complaint    13359 non-null  float64
 3   n_projects         13359 non-null  int64  
 4   recently_promoted  13359 non-null  float64
 5   salary             13359 non-null  object 
 6   satisfaction       13359 non-null  float64
 7   status             13359 non-null  object 
 8   tenure             13359 non-null  float64
dtypes: float64(4), int64(2), object(3)
memory usage: 1.0+ MB

df["churn"] = np.where(df["status"] == "Employed", 0, 1)

df.drop("status", axis=1, inplace=True)

df.rename(columns={"tenure": "tenure_years"}, inplace=True)

df.head()

df.describe()

df['churn'].sum()

np.int64(3161)

numeric_columns = ['avg_monthly_hrs', 'n_projects','tenure_years']

for col in numeric_columns:
  Q1 = df[col].quantile(0.25)
  Q3 = df[col].quantile(0.75)
  IQR = Q3 - Q1

  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR

  outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
  print(col,"outliers:", len(outliers))
  print(col, "outlier values:", np.sort(outliers[col].unique()))

avg_monthly_hrs outliers: 0
avg_monthly_hrs outlier values: []
n_projects outliers: 0
n_projects outlier values: []
tenure_years outliers: 1154
tenure_years outlier values: [ 6.  7.  8. 10.]

sns.histplot(df["tenure_years"], color="#101827")
plt.show()

palette_subset = custom_palette[:2]

sns.barplot(
    x='churn',
    y='tenure_years',
    data=df,
    hue='churn',
    palette=palette_subset
)

plt.show()

palette_subset = custom_palette[:2]

sns.histplot(
    data=df,
    x="tenure_years",
    hue="churn",
    palette=palette_subset,
    bins=20,
    multiple="dodge",
    shrink=0.8
)

plt.show()

print(f"Average tenure: {df['tenure_years'].mean():.2f} years")

Average tenure: 3.50 years

print(f"Average tenure for churned: {df.loc[df["churn"] == 1, "tenure_years"].mean():.2f} years")

Average tenure for churned: 3.88 years

print(f"Average tenure for non-churned: {df.loc[df["churn"] == 0, "tenure_years"].mean():.2f} years")

Average tenure for non-churned: 3.39 years

df['department'] = df['department'].replace({'information_technology': 'IT'})

dept_churn = df.groupby("department")["churn"].mean().reset_index()
dept_churn["churn"] = dept_churn["churn"].round(2)

df['department'].unique()

array(['engineering', 'support', 'sales', 'IT', 'product', 'marketing',
       'procurement', 'finance', 'management', 'admin'], dtype=object)

palette_subset = custom_palette[:10]

ax=sns.barplot(
    data=dept_churn,
    x="department",
    y="churn",
    palette=palette_subset,
    hue="department",
    legend=False
)

for p in ax.patches:
    height = p.get_height()
    ax.annotate(
        f"{height*100:.0f}%",
        (p.get_x() + p.get_width() / 2, height),
        ha="center",
        va="bottom",
        fontsize=9,
        xytext=(0, 3),
        textcoords="offset points"
    )

plt.xticks(rotation=45)
plt.ylabel("Average Churn Rate")
plt.title("Average Churn Rate by Department")
plt.show()

df.head()

bins = [0.0, 0.33, 0.66, 1.0]
labels = ['low', 'medium', 'high']

df['satisfaction'] = pd.cut(df['satisfaction'], bins=bins, labels=labels, include_lowest=True)

print(df['satisfaction'].head())

0      high
1      high
2      high
3    medium
4      high
Name: satisfaction, dtype: category
Categories (3, object): ['low' < 'medium' < 'high']

df = pd.get_dummies(df,columns=['department'], drop_first=False)

df = df.drop(columns=['department_support'])

df = pd.get_dummies(df, drop_first=True)

df.rename(columns={'filed_complaint_Yes':'Filed Complaint'}, inplace=True)
df.rename(columns={'recently_promoted_Yes':'Recently Promoted'}, inplace=True)

kmf = KaplanMeierFitter()
kmf.fit(durations=df['tenure_years'], event_observed=df['churn'])

plt.figure(figsize=(10,6))
kmf.plot_survival_function(ci_show=True, lw=2, color='black')

event_table = kmf.event_table

for t, n_risk, obs in zip(event_table.index, event_table['at_risk'], event_table['observed']):
    if obs > 0:
        churn_percentage = (obs / n_risk) * 100
        plt.text(t, 0.2, f"Time: {t:.1f}, {int(obs)}/{int(n_risk)} ({churn_percentage:.1f}%) ", rotation=45, fontsize=8, color='seagreen')

plt.title("KM Survival Curve: Employee Churn")
plt.xlabel("Time")
plt.ylabel("Survival Probability")
plt.show()

event_table['percent_lost']=event_table['observed']/event_table['at_risk']
event_table['percent_lost']=event_table['percent_lost'].round(2)*100

event_table["conditional_survival"] = 1 - (event_table["observed"] / event_table["at_risk"])
event_table["cumulative_survival"] = event_table["conditional_survival"].cumprod()

event_table = event_table.reset_index()

event_table = event_table[[
    "event_at", "at_risk", "observed", "censored", "removed",
    "conditional_survival","cumulative_survival"
]]

print(event_table)

   event_at  at_risk  observed  censored  removed  conditional_survival  \
0       0.0    13359         0         0        0              1.000000   
1       2.0    13359        48      2842     2890              0.996407   
2       3.0    10469      1398      4333     5731              0.866463   
3       4.0     4738       794      1491     2285              0.832419   
4       5.0     2453       742       557     1299              0.697513   
5       6.0     1154       179       444      623              0.844887   
6       7.0      531         0       182      182              1.000000   
7       8.0      349         0       149      149              1.000000   
8      10.0      200         0       200      200              1.000000   

   cumulative_survival  
0             1.000000  
1             0.996407  
2             0.863350  
3             0.718668  
4             0.501281  
5             0.423526  
6             0.423526  
7             0.423526  
8             0.423526

hazard = event_table['observed'] / event_table['at_risk']

peak_time = hazard.idxmax()
print("Peak churn risk occurs at time:", peak_time)

Peak churn risk occurs at time: 4

median_ = kmf.median_survival_time_
median_confidence_interval_ = median_survival_times(kmf.confidence_interval_)
print(median_)
print(median_confidence_interval_)

6.0
     KM_estimate_lower_0.95  KM_estimate_upper_0.95
0.5                     5.0                     6.0

cph = CoxPHFitter()
cph.fit(df, duration_col='tenure_years', event_col='churn')
cph.print_summary(style='ascii')

<lifelines.CoxPHFitter: fitted with 13359 total observations, 10198 right-censored observations>
             duration col = 'tenure_years'
                event col = 'churn'
      baseline estimation = breslow
   number of observations = 13359
number of events observed = 3161
   partial log-likelihood = -25912.98
         time fit was run = 2025-12-12 23:56:44 UTC

---
                        coef exp(coef)  se(coef)  coef lower 95%  coef upper 95% exp(coef) lower 95% exp(coef) upper 95%
covariate                                                                                                               
avg_monthly_hrs         0.00      1.00      0.00            0.00            0.00                1.00                1.00
filed_complaint        -1.28      0.28      0.08           -1.44           -1.11                0.24                0.33
n_projects             -0.21      0.81      0.02           -0.24           -0.18                0.78                0.83
recently_promoted      -1.34      0.26      0.24           -1.81           -0.87                0.16                0.42
department_IT          -0.21      0.81      0.07           -0.36           -0.07                0.70                0.93
department_admin       -0.35      0.70      0.17           -0.69           -0.01                0.50                0.99
department_engineering  0.03      1.03      0.06           -0.09            0.14                0.91                1.15
department_finance     -0.10      0.90      0.08           -0.27            0.06                0.77                1.07
department_management  -0.31      0.73      0.12           -0.55           -0.08                0.58                0.92
department_marketing   -0.14      0.87      0.08           -0.30            0.03                0.74                1.03
department_procurement -0.98      0.38      0.25           -1.47           -0.48                0.23                0.62
department_product     -0.18      0.83      0.08           -0.35           -0.02                0.71                0.98
department_sales       -0.10      0.90      0.05           -0.21            0.00                0.81                1.00
salary_low              1.62      5.04      0.12            1.38            1.86                3.97                6.40
salary_medium           1.13      3.09      0.12            0.89            1.37                2.43                3.93
satisfaction_medium     0.01      1.01      0.05           -0.09            0.11                0.92                1.11
satisfaction_high      -1.11      0.33      0.05           -1.21           -1.01                0.30                0.36

                        cmp to      z      p  -log2(p)
covariate                                             
avg_monthly_hrs           0.00   5.45 <0.005     24.25
filed_complaint           0.00 -15.10 <0.005    168.69
n_projects                0.00 -13.18 <0.005    129.39
recently_promoted         0.00  -5.63 <0.005     25.69
department_IT             0.00  -2.94 <0.005      8.27
department_admin          0.00  -2.01   0.04      4.48
department_engineering    0.00   0.43   0.67      0.58
department_finance        0.00  -1.20   0.23      2.11
department_management     0.00  -2.66   0.01      7.01
department_marketing      0.00  -1.61   0.11      3.21
department_procurement    0.00  -3.85 <0.005     13.03
department_product        0.00  -2.14   0.03      4.95
department_sales          0.00  -1.87   0.06      4.04
salary_low                0.00  13.24 <0.005    130.46
salary_medium             0.00   9.15 <0.005     63.92
satisfaction_medium       0.00   0.21   0.84      0.26
satisfaction_high         0.00 -22.35 <0.005    365.03
---
Concordance = 0.82
Partial AIC = 51859.96
log-likelihood ratio test = 1896.40 on 17 df
-log2(p) of ll-ratio test = inf

cph.baseline_hazard_

baseline_hazard = cph.baseline_hazard_
peak_time = baseline_hazard.idxmax().iloc[0]
print("Peak churn risk occurs at time:", peak_time)

Peak churn risk occurs at time: 5.0

cph.plot()

<Axes: xlabel='log(HR) (95% CI)'>

covariates = ['filed_complaint', 'recently_promoted', 'salary_low', 'salary_medium', 'satisfaction_medium', 'satisfaction_high']
colors_list = [custom_palette[10:12], custom_palette[2:8], custom_palette[3:9],
               custom_palette[4:10], custom_palette[5:11], custom_palette[4:7]]

fig, axes = plt.subplots(2, 3, figsize=(18,10))

axes = axes.flatten()

for i, cov in enumerate(covariates):
    ax = axes[i]
    cph.plot_partial_effects_on_outcome(
        cov,
        [0, 1],
        plot_baseline=False,
        lw=4,
        color=colors_list[i],
        ax=ax
    )
    ax.set_xlabel("event_at")
    ax.set_ylabel("Predicted Survival Probability")
    ax.set_title(cov)

plt.tight_layout()
plt.show()

# cph.check_assumptions(df, p_value_threshold=0.05, show_plots=True)

kmf = KaplanMeierFitter()
T = df['tenure_years']
E = df['churn']
kmf.fit(T, event_observed=E, label="Kaplan-Meier")

wb = WeibullFitter().fit(T, event_observed=E, label="Weibull")
lnf = LogNormalFitter().fit(T, event_observed=E, label="Log-normal")
llf = LogLogisticFitter().fit(T, event_observed=E, label="Log-logistic")
exp = ExponentialFitter().fit(T, event_observed=E, label="Exponential")
gg = GeneralizedGammaFitter().fit(T, event_observed=E, label="Generalized Gamma")

t_grid = np.linspace(T.min(), T.max(), 200)

fig, ax = plt.subplots(figsize=(7, 6))

kmf.plot_survival_function(ax=ax, ci_show=False, color=custom_palette[12])

models = [
    (wb, "Weibull (univariate)", 1),
    (lnf, "Log-normal (univariate)", 2),
    (llf, "Log-logistic (univariate)", 3),
    (exp, "Exponential (univariate)", 4),
    (gg, "Generalized Gamma (univariate)", 5)
]

for model, label, color_idx in models:
    ax.plot(t_grid, model.survival_function_at_times(t_grid).values.flatten(),
            label=label, color=custom_palette[color_idx])

ax.set_xlabel("Tenure (years)")
ax.set_ylabel("Survival probability")
ax.set_title("KM vs parametric univariate survival curves")
ax.legend()
plt.show()

T = df['tenure_years']
E = df['churn']

kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E, label="Kaplan-Meier")

aft_models = {
    "Weibull (AFT)": WeibullAFTFitter(),
    "Log-Logistic (AFT)": LogLogisticAFTFitter(),
    "Log-Normal (AFT)": LogNormalAFTFitter()
}

gamma_model = GeneralizedGammaRegressionFitter(penalizer=0.5)
gamma_model._scipy_fit_method = "SLSQP"

for name, model in aft_models.items():
    model.fit(df, duration_col='tenure_years', event_col='churn')

gamma_model.fit(df, duration_col='tenure_years', event_col='churn')

t_grid = np.linspace(T.min(), T.max(), 200)

fig, ax = plt.subplots(figsize=(8, 6))

kmf.plot_survival_function(ax=ax, ci_show=False, color=custom_palette[12])

for i, (name, model) in enumerate(aft_models.items()):
    surv_df = model.predict_survival_function(df)
    surv_mean = np.interp(t_grid, surv_df.index.values, surv_df.mean(axis=1))
    ax.plot(t_grid, surv_mean, label=name, color=custom_palette[i+1])

gamma_surv_df = gamma_model.predict_survival_function(df)
gamma_surv_mean = np.interp(t_grid, gamma_surv_df.index.values, gamma_surv_df.mean(axis=1))
ax.plot(t_grid, gamma_surv_mean, label="Generalized Gamma (Regression)",  color=custom_palette[4])

ax.set_xlabel("Tenure (years)")
ax.set_ylabel("Survival probability")
ax.set_title("Kaplan-Meier vs Parametric AFT Survival Curves")
ax.legend()
plt.show()

from lifelines import (
    WeibullFitter,
    LogNormalFitter,
    LogLogisticFitter,
    ExponentialFitter,
    GeneralizedGammaFitter,
    WeibullAFTFitter,
    LogNormalAFTFitter,
    LogLogisticAFTFitter,
    GeneralizedGammaRegressionFitter
)

T = df['tenure_years']
E = df['churn']

models_univariate = {
    "Weibull_uni": WeibullFitter(),
    "LogNormal_uni": LogNormalFitter(),
    "LogLogistic_uni": LogLogisticFitter(),
    "Exponential_uni": ExponentialFitter(),
    "Gamma_uni": GeneralizedGammaFitter()
}

aic_univariate = {}

print("\n====== UNIVARIATE BASELINE MODELS ======")
for name, model in models_univariate.items():
    model.fit(T, event_observed=E)
    aic_univariate[name] = model.AIC_
    print(f"{name:18s} | AIC = {model.AIC_:10.3f}   LogLik = {model.log_likelihood_:10.3f}")

models_aft = {
    "Weibull_AFT": WeibullAFTFitter(),
    "LogNormal_AFT": LogNormalAFTFitter(),
    "LogLogistic_AFT": LogLogisticAFTFitter(),
    "Gamma_AFT": GeneralizedGammaRegressionFitter(penalizer=0.5)
}

# Gamma regression required SLSQP in order to converge
models_aft["Gamma_AFT"]._scipy_fit_method = "SLSQP"

aic_aft = {}

print("\n====== AFT REGRESSION MODELS (WITH COVARIATES) ======")
for name, model in models_aft.items():
    model.fit(df, duration_col="tenure_years", event_col="churn")
    aic_aft[name] = model.AIC_
    print(f"{name:18s} | AIC = {model.AIC_:10.3f}   LogLik = {model.log_likelihood_:10.3f}")

best_uni = min(aic_univariate, key=aic_univariate.get)
best_aft = min(aic_aft, key=aic_aft.get)

print("\n====== MODEL SELECTION SUMMARY ======")
print(f"Best Univariate Baseline Model:  {best_uni}  (AIC = {aic_univariate[best_uni]:.2f})")
print(f"Best AFT Regression Model:      {best_aft}  (AIC = {aic_aft[best_aft]:.2f})")

====== UNIVARIATE BASELINE MODELS ======
Weibull_uni        | AIC =  18757.579   LogLik =  -9376.790
LogNormal_uni      | AIC =  17209.244   LogLik =  -8602.622
LogLogistic_uni    | AIC =  17546.613   LogLik =  -8771.306
Exponential_uni    | AIC =  23363.378   LogLik = -11680.689
Gamma_uni          | AIC =  16199.551   LogLik =  -8096.776

====== AFT REGRESSION MODELS (WITH COVARIATES) ======
Weibull_AFT        | AIC =  16684.557   LogLik =  -8323.279
LogNormal_AFT      | AIC =  14676.389   LogLik =  -7319.195
LogLogistic_AFT    | AIC =  14583.298   LogLik =  -7272.649
Gamma_AFT          | AIC =  17597.853   LogLik =  -8747.926

====== MODEL SELECTION SUMMARY ======
Best Univariate Baseline Model:  Gamma_uni  (AIC = 16199.55)
Best AFT Regression Model:      LogLogistic_AFT  (AIC = 14583.30)

warnings.filterwarnings("ignore", category=exceptions.StatisticalWarning)
# Supressing lengthy warning that came from 0.5 penalizer models

penalizers = [0.0, 1e-3, 1e-2, 5e-2, 1e-1, 5e-1]
l1_ratios = [0.0, 0.5, 1.0]

results = {}
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for penalizer in penalizers:
    for l1_ratio in l1_ratios:
        aft_cv = LogLogisticAFTFitter(penalizer=penalizer, l1_ratio=l1_ratio)
        cv_scores = []
        for train_idx, val_idx in kf.split(df):
            train = df.iloc[train_idx]
            val = df.iloc[val_idx]
            aft_cv.fit(train, 'tenure_years', 'churn')
            cv_scores.append(aft_cv.concordance_index_)
        cv_concordance = np.mean(cv_scores)

        aft_full = LogLogisticAFTFitter(penalizer=penalizer, l1_ratio=l1_ratio)
        aft_full.fit(df, 'tenure_years', 'churn')
        aic = aft_full.AIC_

        results[(penalizer, l1_ratio)] = {'concordance': cv_concordance, 'AIC': aic}
        print(f"penalizer={penalizer}, l1_ratio={l1_ratio}: C={cv_concordance:.3f}, AIC={aic:.1f}")

penalizer=0.0, l1_ratio=0.0: C=0.834, AIC=14583.3
penalizer=0.0, l1_ratio=0.5: C=0.834, AIC=14583.3
penalizer=0.0, l1_ratio=1.0: C=0.834, AIC=14583.3
penalizer=0.001, l1_ratio=0.0: C=0.834, AIC=14626.5
penalizer=0.001, l1_ratio=0.5: C=0.834, AIC=14642.0
penalizer=0.001, l1_ratio=1.0: C=0.834, AIC=14657.4
penalizer=0.01, l1_ratio=0.0: C=0.834, AIC=15005.2
penalizer=0.01, l1_ratio=0.5: C=0.835, AIC=15157.8
penalizer=0.01, l1_ratio=1.0: C=0.836, AIC=15300.9
penalizer=0.05, l1_ratio=0.0: C=0.834, AIC=16487.0
penalizer=0.05, l1_ratio=0.5: C=0.835, AIC=17207.4
penalizer=0.05, l1_ratio=1.0: C=0.832, AIC=17765.7
penalizer=0.1, l1_ratio=0.0: C=0.831, AIC=17936.6
penalizer=0.1, l1_ratio=0.5: C=0.832, AIC=19277.3
penalizer=0.1, l1_ratio=1.0: C=0.833, AIC=20327.6
penalizer=0.5, l1_ratio=0.0: C=0.804, AIC=22112.1
penalizer=0.5, l1_ratio=0.5: C=0.698, AIC=24133.3
penalizer=0.5, l1_ratio=1.0: C=0.714, AIC=27222.2

aft = LogLogisticAFTFitter(penalizer=0.01, l1_ratio=1)
aft.fit(df, 'tenure_years', 'churn')

aft.print_summary(columns=['coef', 'exp(coef)'])

alphas = aft.summary.loc['alpha_']['coef']
selected_vars = [var for var in alphas[abs(alphas) >= 0.03].index.tolist() if var != 'Intercept']
print(f"Selected significant variables: {selected_vars}")

Selected significant variables: ['department_management', 'department_procurement', 'filed_complaint', 'n_projects', 'recently_promoted', 'salary_low', 'salary_medium', 'satisfaction_high']

aft_unbiased = LogLogisticAFTFitter(penalizer=0.0)

columns_for_fit = selected_vars + ['tenure_years', 'churn']
aft_unbiased.fit(df[columns_for_fit], 'tenure_years', 'churn')

aft_unbiased.print_summary(columns=['coef', 'exp(coef)', 'se(coef)', 'p'])

aft_unbiased_summary = aft_unbiased.summary.loc['alpha_']

p_col = 'p' if 'p' in aft_unbiased_summary.columns else 'p-value'
summary_subset = aft_unbiased_summary[['exp(coef)', p_col]].rename(columns={p_col: 'p'})

summary_subset['significant'] = summary_subset['p'] < 0.05

increases = summary_subset[summary_subset['exp(coef)'] > 1].sort_values(by='exp(coef)', ascending=False)
decreases = summary_subset[summary_subset['exp(coef)'] <= 1].sort_values(by='exp(coef)', ascending=True)

print("=== Time Ratios (Highest to Lowest) ===")
print("When covariate increases by 1, the expected survival duration changes by a factor of ____ :\n")

print("--- >1: Increases Survival ---")
print(increases)

print("\n--- <1: Decreases Survival ---")
print(decreases)

=== Time Ratios (Highest to Lowest) ===
When covariate increases by 1, the expected survival duration changes by a factor of ____ :

--- >1: Increases Survival ---
                        exp(coef)              p  significant
covariate                                                    
Intercept                4.926868   0.000000e+00         True
recently_promoted        1.523309   1.767734e-14         True
satisfaction_high        1.427999   0.000000e+00         True
filed_complaint          1.403079   3.618804e-69         True
department_procurement   1.202124   6.650798e-04         True
department_management    1.188153   1.350199e-10         True
n_projects               1.064590  1.516767e-127         True

--- <1: Decreases Survival ---
               exp(coef)             p  significant
covariate                                          
salary_low      0.622253  2.333843e-56         True
salary_medium   0.691152  2.339357e-34         True

aft_unbiased.plot()

<Axes: xlabel='log(accelerated failure rate) (95% CI)'>

binary_covs = [
    c for c in selected_vars
    if ((df[c].dropna().isin([0, 1]).all()) or (df[c].dtype == 'bool'))
]

print("Binary covariates to plot:", binary_covs)

base_profile_data = {}
for col in selected_vars:
    if df[col].dtype in ['int64', 'float64']:
        base_profile_data[col] = df[col].mean()
    elif df[col].dtype == 'bool':
        base_profile_data[col] = 0
    else:
        base_profile_data[col] = df[col].mode()[0]

base_profile_df = pd.DataFrame([base_profile_data])
base_profile_df = base_profile_df[selected_vars]

n = len(binary_covs)
cols = 3
rows = math.ceil(n / cols)

fig, axes = plt.subplots(rows, cols, figsize=(18, 5 * rows))
axes = axes.flatten()

for i, cov in enumerate(binary_covs):
    ax = axes[i]

    profile0 = base_profile_df.copy()
    profile1 = base_profile_df.copy()
    profile0[cov] = 0.0
    profile1[cov] = 1.0

    sf0 = aft_unbiased.predict_survival_function(profile0)
    sf1 = aft_unbiased.predict_survival_function(profile1)

    sf0.columns = [f"{cov}=0"]
    sf1.columns = [f"{cov}=1"]

    ax.plot(sf0.index, sf0.iloc[:, 0],
            label=f"{cov}=0", color=custom_palette[i+1], linestyle="--")
    ax.plot(sf1.index, sf1.iloc[:, 0],
            label=f"{cov}=1", color=custom_palette[i+1])

    ax.set_title(f"AFT Survival Curves: {cov}")
    ax.set_xlabel("Time")
    ax.set_ylabel("Survival Probability")
    ax.legend()

for j in range(i+1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

Binary covariates to plot: ['department_management', 'department_procurement', 'filed_complaint', 'recently_promoted', 'salary_low', 'salary_medium', 'satisfaction_high']

X = df[selected_vars]

censored = df[~df['churn'].astype(bool)]
uncensored = df[df['churn'].astype(bool)]

X_cens = censored[selected_vars]
X_uncens = uncensored[selected_vars]

cens_medians = aft_unbiased.predict_median(X_cens)
uncens_medians = aft_unbiased.predict_median(X_uncens)

cens_group_median = cens_medians.mean()
uncens_group_median = uncens_medians.mean()

print(f"Censored median total time: {cens_group_median:.2f} years")
print(f"Uncensored median total time: {uncens_group_median:.2f} years")
print(f"Full group median total time: {aft_unbiased.predict_median(X).mean():.2f} years")

Censored median total time: 5.91 years
Uncensored median total time: 4.76 years
Full group median total time: 5.64 years

censored_subjects_last_obs = censored['tenure_years']

censored_survival_function = aft_unbiased.predict_survival_function(censored, conditional_after=censored_subjects_last_obs)

group_survival_function = censored_survival_function.mean(axis=1)

group_median_remaining = group_survival_function[group_survival_function <= 0.5].index[0]
print(f"Group median remaining time: {group_median_remaining:.2f} years")

Group median remaining time: 3.00 years

cens_sf = aft_unbiased.predict_survival_function(X_cens)
uncens_sf = aft_unbiased.predict_survival_function(X_uncens)
full_sf = aft_unbiased.predict_survival_function(X)

cens_group_sf = cens_sf.mean(axis=1)
uncens_group_sf = uncens_sf.mean(axis=1)
full_group_sf = full_sf.mean(axis=1)

comparison_table = pd.DataFrame({
    'Censored_only': cens_group_sf,
    'Uncensored_only': uncens_group_sf,
    'Full_group': full_group_sf
})

print(comparison_table.head(10))

      Censored_only  Uncensored_only  Full_group
2.0        0.993519         0.984099    0.991290
3.0        0.941740         0.869852    0.924730
4.0        0.800219         0.634083    0.760908
5.0        0.611679         0.410199    0.564005
6.0        0.432441         0.247598    0.388704
7.0        0.292695         0.143277    0.257340
8.0        0.195930         0.082330    0.169050
10.0       0.089306         0.028911    0.075016

fig, ax = plt.subplots(figsize=(10, 6))
full_group_sf.plot(ax=ax, label="Full group", color=custom_palette[12], linewidth=2)
cens_group_sf.plot(ax=ax, label="Censored only", color=custom_palette[1])
uncens_group_sf.plot(ax=ax, label="Uncensored only", color=custom_palette[5])
ax.set_xlabel("Time (years)")
ax.set_ylabel("Average survival probability")
ax.set_title("Group survival functions")
ax.legend()
plt.show()

metrics = {}
fig, axes = plt.subplots(2, 5, figsize=(20, 8))
axes = axes.ravel()

# Hide initial printed output
f = io.StringIO()

for i, t0 in enumerate(range(1, 11)):
    with contextlib.redirect_stdout(f), contextlib.redirect_stderr(f):
        ax_plot, ici, e50 = survival_probability_calibration(
            aft_unbiased,
            df[selected_vars + ['tenure_years', 'churn']],
            t0=t0,
            ax=axes[i]
        )

    metrics[t0] = {'ICI': round(ici, 3), 'E50': round(e50, 3)}
    axes[i].set_title(
        f't0={t0} (ICI={metrics[t0]["ICI"]}, E50={metrics[t0]["E50"]})'
    )

plt.tight_layout()
plt.show()

for t0, vals in metrics.items():
    print(f"t0={t0}: ICI={vals['ICI']}, E50={vals['E50']}")

t0=1: ICI=0.003, E50=0.002
t0=2: ICI=0.019, E50=0.015
t0=3: ICI=0.029, E50=0.03
t0=4: ICI=0.07, E50=0.035
t0=5: ICI=0.097, E50=0.055
t0=6: ICI=0.113, E50=0.124
t0=7: ICI=0.109, E50=0.114
t0=8: ICI=0.093, E50=0.095
t0=9: ICI=0.234, E50=0.254
t0=10: ICI=0.066, E50=0.015

	Missing Count	Missing %
recently_promoted	13949	97.894589
filed_complaint	12191	85.556881
last_evaluation	1532	10.751632
department	709	4.975788
satisfaction	181	1.270265
tenure	181	1.270265

	avg_monthly_hrs	filed_complaint	n_projects	recently_promoted	satisfaction	tenure_years	churn
count	13359.000000	13359.000000	13359.000000	13359.000000	13359.000000	13359.000000	13359.000000
mean	201.203159	0.145071	3.807845	0.021334	0.622194	3.504155	0.236620
std	49.930370	0.352185	1.235721	0.144500	0.250476	1.473922	0.425023
min	96.000000	0.000000	2.000000	0.000000	0.040058	2.000000	0.000000
25%	156.000000	0.000000	3.000000	0.000000	0.451782	3.000000	0.000000
50%	200.000000	0.000000	4.000000	0.000000	0.654061	3.000000	0.000000
75%	245.000000	0.000000	5.000000	0.000000	0.825668	4.000000	0.000000
max	310.000000	1.000000	7.000000	1.000000	1.000000	10.000000	1.000000

Install Packages¶

Load Libraries¶

Prediction Accuracy¶

	avg_monthly_hrs	department	filed_complaint	last_evaluation	n_projects	recently_promoted	salary	satisfaction	status	tenure
0	221	engineering	NaN	0.932868	4	NaN	low	0.829896	Left	5.0
1	232	support	NaN	NaN	3	NaN	low	0.834544	Employed	2.0
2	184	sales	NaN	0.788830	3	NaN	medium	0.834988	Employed	3.0
3	206	sales	NaN	0.575688	4	NaN	low	0.424764	Employed	2.0
4	249	sales	NaN	0.845217	3	NaN	low	0.779043	Employed	3.0

	baseline hazard
2.0	0.002461
3.0	0.088469
4.0	0.130548
5.0	0.266752
6.0	0.140777
7.0	0.000000
8.0	0.000000
10.0	0.000000

model	lifelines.LogLogisticAFTFitter
duration col	'tenure_years'
event col	'churn'
penalizer	0.01
number of observations	13359
number of events observed	3161
log-likelihood	-7631.43
time fit was run	2025-12-13 00:04:20 UTC

		coef	exp(coef)
alpha_	avg_monthly_hrs	-0.00	1.00
	department_IT	0.02	1.02
	department_admin	0.02	1.02
	department_engineering	-0.02	0.98
	department_finance	0.00	1.00
	department_management	0.18	1.20
	department_marketing	0.01	1.01
	department_procurement	0.15	1.16
	department_product	0.01	1.01
	department_sales	0.01	1.01
	filed_complaint	0.32	1.38
	n_projects	0.07	1.07
	recently_promoted	0.39	1.47
	salary_low	-0.38	0.69
	salary_medium	-0.27	0.76
	satisfaction_high	0.35	1.43
	satisfaction_medium	-0.01	0.99
	Intercept	1.57	4.80
beta_	Intercept	1.73	5.66

Concordance	0.84
AIC	15300.87
log-likelihood ratio test	2279.75 on 17 df
-log2(p) of ll-ratio test	inf

		coef	exp(coef)	se(coef)	p
alpha_	department_management	0.17	1.19	0.03	<0.005
	department_procurement	0.18	1.20	0.05	<0.005
	filed_complaint	0.34	1.40	0.02	<0.005
	n_projects	0.06	1.06	0.00	<0.005
	recently_promoted	0.42	1.52	0.05	<0.005
	salary_low	-0.47	0.62	0.03	<0.005
	salary_medium	-0.37	0.69	0.03	<0.005
	satisfaction_high	0.36	1.43	0.01	<0.005
	Intercept	1.59	4.93	0.03	<0.005
beta_	Intercept	1.75	5.78	0.01	<0.005

Concordance	0.84
AIC	14619.89
log-likelihood ratio test	2942.72 on 8 df
-log2(p) of ll-ratio test	inf