Propensity Score Stratification


Last updated


Last updated
# Excerpt from the body of the _estimate_effect function in
# dowhy.causal_estimators.propensity_score_stratification_estimator.
# NOTE(review): this is a fragment of a method body, so `self` (the estimator
# instance) and self._data (assumed to be a pandas DataFrame that already has
# a 'propensity_score' column) come from code not shown here.

# Rank every unit by propensity score (ascending) and turn the rank into a
# stratum label: rank / num_rows lies in (0, 1], is scaled by num_strata and
# rounded, so labels can range from 0 up to num_strata.
num_rows = self._data[self._outcome_name].shape[0]
self._data['strata'] = (
    (self._data['propensity_score'].rank(ascending=True) / num_rows) * self.num_strata
).round(0)

# Helper columns so per-stratum sums give group sizes and outcome totals.
# NOTE(review): assumes the treatment column is coded 0/1 (or boolean) — confirm.
self._data['dbar'] = 1 - self._data[self._treatment_name[0]]  # 1 (True) for control units
self._data['d_y'] = (
    self._data[self._treatment_name[0]] * self._data[self._outcome_name]
)  # outcome of treated units (0 for control units)
self._data['dbar_y'] = (
    self._data['dbar'] * self._data[self._outcome_name]
)  # outcome of control units (0 for treated units)

# Keep only strata in which BOTH the treated and the control group have more
# than clipping_threshold units; all other strata are dropped.
stratified = self._data.groupby('strata')
clipped = stratified.filter(
    lambda strata: min(
        strata.loc[strata[self._treatment_name[0]] == 1].shape[0],
        strata.loc[strata[self._treatment_name[0]] == 0].shape[0],
    ) > self.clipping_threshold
)
self.logger.debug(
    "After using clipping_threshold={0}, here are the number of data points in each strata:\n {1}".format(
        self.clipping_threshold,
        clipped.groupby(['strata', self._treatment_name[0]])[self._outcome_name].count(),
    )
)
if clipped.empty:
    raise ValueError(
        "Method requires strata with number of data points per treatment > clipping_threshold (={0}). No such strata exists. Consider decreasing 'num_strata' or 'clipping_threshold' parameters.".format(
            self.clipping_threshold
        )
    )

# Per stratum: number of treated units, number of control units, and the
# summed outcome of each group.
weighted_outcomes = clipped.groupby('strata').agg({
    self._treatment_name[0]: ['sum'],
    'dbar': ['sum'],
    'd_y': ['sum'],
    'dbar_y': ['sum']
})
# Flatten the (column, aggregation) MultiIndex into "<column>_<agg>" names.
# Iterating the MultiIndex directly yields the same tuples as the former
# `.columns.ravel()` call, without relying on Index.ravel(), whose return
# type differs between pandas versions.
weighted_outcomes.columns = ["_".join(x) for x in weighted_outcomes.columns]
treatment_sum_name = self._treatment_name[0] + "_sum"
control_sum_name = "dbar_sum"

# Mean outcome per group within each stratum, and their difference.
weighted_outcomes['d_y_mean'] = (
    weighted_outcomes['d_y_sum'] / weighted_outcomes[treatment_sum_name]
)  # mean outcome of treated units
weighted_outcomes['dbar_y_mean'] = (
    weighted_outcomes['dbar_y_sum'] / weighted_outcomes['dbar_sum']
)  # mean outcome of control units
weighted_outcomes['effect'] = (
    weighted_outcomes['d_y_mean'] - weighted_outcomes['dbar_y_mean']
)  # treated mean minus control mean

# Group sizes over the retained strata only.
total_treatment_population = weighted_outcomes[treatment_sum_name].sum()  # number of treated units
total_control_population = weighted_outcomes[control_sum_name].sum()  # number of control units
total_population = total_treatment_population + total_control_population  # all retained units
self.logger.debug(
    "Total number of data points is {0}, including {1} from treatment and {2} from control.".format(
        total_population, total_treatment_population, total_control_population
    )
)

# Combine the per-stratum effects into one estimate; the weights depend on
# the requested target population.
if self._target_units == "att":
    # ATT: weight each stratum by its number of treated units.
    est = (
        weighted_outcomes['effect'] * weighted_outcomes[treatment_sum_name]
    ).sum() / total_treatment_population
elif self._target_units == "atc":
    # ATC: weight each stratum by its number of control units.
    est = (
        weighted_outcomes['effect'] * weighted_outcomes[control_sum_name]
    ).sum() / total_control_population
elif self._target_units == "ate":
    # ATE: weight each stratum by its total number of units.
    est = (
        weighted_outcomes['effect']
        * (weighted_outcomes[control_sum_name] + weighted_outcomes[treatment_sum_name])
    ).sum() / total_population
else:
    raise ValueError("Target units string value not supported")