# Page-capture residue, not part of the program: "Spaces: Sleeping".
import gradio as gr | |
import pandas as pd | |
import statsmodels.formula.api as smf | |
from linearmodels.iv import IV2SLS | |
from scipy import stats | |
import warnings | |
# Suppress FutureWarning messages for the whole process.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Dataset shared between the upload handler and the regression handler.
# A module-level `global df` statement does nothing, so the name used to be
# undefined until the first upload; bind it explicitly to "no data yet".
df = None
def process_file(file):
    """Load the uploaded CSV into the module-level ``df`` and refresh the dropdowns.

    Args:
        file: Value delivered by the file-upload component. It may be ``None``
            (the upload was cleared), an object exposing the path as
            ``.name``, or the path itself.

    Returns:
        A 5-tuple of ``gr.update`` payloads, in this order: available columns,
        dependent variable, endogenous variables, instruments, exogenous
        variables. After a successful load every dropdown offers the CSV's
        column names and the "available columns" dropdown has all of them
        selected; after a clear every dropdown is emptied.
    """
    global df

    if file is None:
        # Clearing the upload also fires the change event; previously this
        # raised AttributeError on `file.name`. Forget the old dataset so a
        # later run cannot silently use data the user has removed.
        df = None
        return (
            gr.update(choices=[], value=[]),
            gr.update(choices=[], value=None),
            gr.update(choices=[], value=[]),
            gr.update(choices=[], value=[]),
            gr.update(choices=[], value=[]),
        )

    # Accept both a file-like wrapper (path in `.name`) and a bare path.
    csv_path = getattr(file, "name", file)
    df = pd.read_csv(csv_path)
    columns = df.columns.tolist()

    return (
        gr.update(choices=columns, value=columns),
        gr.update(choices=columns),
        gr.update(choices=columns),
        gr.update(choices=columns),
        gr.update(choices=columns),
    )
def _as_list(selection):
    """Return a dropdown selection (None, one name, or several) as a new list."""
    if not selection:
        return []
    if isinstance(selection, (list, tuple)):
        return list(selection)
    return [selection]


def run_2sls(dependent_var, endogenous_vars, instruments, exogenous_vars):
    """Estimate a 2SLS model on the uploaded data and return a text report.

    The report contains the 2SLS coefficient table, a regression-based
    Durbin-Wu-Hausman endogeneity test, a first-stage F statistic on the
    excluded instruments for every endogenous regressor, and (when the model
    is overidentified) the Sargan overidentification test.

    Args:
        dependent_var: Name of the outcome column.
        endogenous_vars: Name(s) of the endogenous regressors.
        instruments: Name(s) of the excluded instruments.
        exogenous_vars: Name(s) of the included exogenous regressors; may be
            empty or ``None``.

    Returns:
        The formatted report, or a string starting with ``"Error:"`` when the
        selection is invalid, no dataset is loaded, or estimation fails. The
        function never raises, because its result is shown directly in the UI.
    """
    if not all([dependent_var, endogenous_vars, instruments]):
        return "Error: Please select all required variables."

    endogenous_vars = _as_list(endogenous_vars)
    instruments = _as_list(instruments)
    exogenous_vars = _as_list(exogenous_vars)

    if len(instruments) < len(endogenous_vars):
        return "Error: The number of instruments must be at least equal to the number of endogenous variables."

    selected = [dependent_var, *endogenous_vars, *instruments, *exogenous_vars]
    if len(set(selected)) != len(selected):
        # A column used in two roles makes the model degenerate and would
        # also produce duplicate columns in the working frame.
        return "Error: Each variable can only be assigned to one role."

    # The dataset lives in a module global that only exists after an upload.
    data = globals().get("df")
    if data is None:
        return "Error: Please upload a CSV file before running the regression."

    missing = [name for name in selected if name not in data.columns]
    if missing:
        # Selections can outlive the file they were made for.
        return f"Error: Column(s) not found in the uploaded file: {', '.join(missing)}"

    try:
        # Work on an independent copy so helper columns never touch `df`.
        sample = data[selected].dropna().copy()
        if sample.empty:
            return "Error: No complete rows remain after dropping missing values."

        # --- 2SLS estimate -------------------------------------------------
        # Built from a term list so an empty exogenous set cannot leave a
        # dangling '+' in the formula. The estimator's own covariance is
        # reported: OLS on first-stage fitted values gives the same
        # coefficients but standard errors that are not valid for 2SLS.
        endog_block = f"[{' + '.join(endogenous_vars)} ~ {' + '.join(instruments)}]"
        iv_formula = f"{dependent_var} ~ " + " + ".join(["1", *exogenous_vars, endog_block])
        iv_model = IV2SLS.from_formula(iv_formula, data=sample).fit(cov_type="unadjusted")

        # --- First stage: weak-instrument F and control-function residuals --
        first_stage_rhs = " + ".join(instruments + exogenous_vars)
        restricted_rhs = " + ".join(exogenous_vars) if exogenous_vars else "1"
        weak_lines = []
        resid_cols = []
        for position, var in enumerate(endogenous_vars):
            first_stage = smf.ols(f"{var} ~ {first_stage_rhs}", data=sample).fit()
            # Instrument strength is the F test on the excluded instruments
            # only, i.e. against the first stage without them; the overall
            # regression F would also credit the exogenous controls.
            restricted = smf.ols(f"{var} ~ {restricted_rhs}", data=sample).fit()
            f_stat, _, _ = first_stage.compare_f_test(restricted)
            weak_lines.append(
                f"\nWeak instrument test for {var}: First-stage F-statistic = {f_stat:.5f}"
            )
            resid_col = f"_first_stage_resid_{position}"
            sample[resid_col] = first_stage.resid
            resid_cols.append(resid_col)
        weak_instrument_results = "".join(weak_lines)

        # --- Hausman (Durbin-Wu-Hausman, regression form) -------------------
        # Add the first-stage residuals to the OLS equation and test them
        # jointly; under the null (regressors exogenous) they are irrelevant.
        ols_rhs = " + ".join(endogenous_vars + exogenous_vars)
        ols_model = smf.ols(f"{dependent_var} ~ {ols_rhs}", data=sample).fit()
        augmented = smf.ols(
            f"{dependent_var} ~ {ols_rhs} + {' + '.join(resid_cols)}", data=sample
        ).fit()
        hausman_stat, hausman_p_value, hausman_df = augmented.compare_f_test(ols_model)

        # --- Sargan overidentification test ---------------------------------
        overid_df = len(instruments) - len(endogenous_vars)
        if overid_df > 0:
            # n * R^2 from regressing the IV residuals on every instrument
            # and exogenous regressor, chi-square with overid_df degrees.
            sargan_fit = smf.ols(
                "iv_resid ~ " + first_stage_rhs,
                data=sample.assign(iv_resid=iv_model.resids),
            ).fit()
            sargan_stat = len(sample) * sargan_fit.rsquared
            sargan_p_value = stats.chi2.sf(sargan_stat, df=overid_df)
            sargan_line = f"Sargan statistic = {sargan_stat:.5f}, p-value = {sargan_p_value:.5f}"
        else:
            # With zero degrees of freedom the statistic has no distribution
            # (the p-value used to come out as nan).
            sargan_line = "Not computed: the model is exactly identified."

        results = f"2SLS regression results:\n{iv_model.summary}\n\n"
        results += (
            "Hausman test \nNull hypothesis: OLS estimates are consistent\n"
            f"F({int(hausman_df)}, {int(augmented.df_resid)}) = {hausman_stat:.5f}, "
            f"p-value = {hausman_p_value:.5f}\n"
        )
        results += weak_instrument_results + "\n"
        results += (
            "\nSargan test\nNull hypothesis: all instruments are valid\n"
            f"{sargan_line}\n\n"
            "Sargan test (overidentification test) is only applicable when the model "
            "is overidentified (IVs > Endogenous X's)."
        )
        return results
    except Exception as e:
        # UI boundary: surface any estimation failure as text instead of
        # letting the event handler crash.
        return f"Error: {e}"
# Build the interface: a CSV upload, one dropdown per variable role, a run
# button and a text box for the report.
# NOTE(review): the original indentation was lost in extraction; the nesting
# below (two columns inside one row, button and output underneath) is a
# reconstruction -- confirm against the intended layout.
with gr.Blocks() as app:
    gr.Markdown("## Two-Stage Least Squares Regression (2SLS)")
    file_input = gr.File(label="Upload CSV File")

    with gr.Row():
        with gr.Column():
            available_columns = gr.Dropdown(
                label="Available Columns",
                choices=[],
                multiselect=True,
            )
        with gr.Column():
            dependent_var = gr.Dropdown(label="Dependent Variable", choices=[])
            endogenous_vars = gr.Dropdown(
                label="Endogenous Variables",
                choices=[],
                multiselect=True,
            )
            instruments = gr.Dropdown(
                label="Instruments",
                choices=[],
                multiselect=True,
            )
            exogenous_vars = gr.Dropdown(
                label="Exogenous Variables (optional)",
                choices=[],
                multiselect=True,
            )

    run_button = gr.Button("Run 2SLS Regression")
    output = gr.Textbox(label="Regression Output", lines=20)

    # A new upload repopulates every dropdown with the file's columns.
    file_input.change(
        process_file,
        inputs=file_input,
        outputs=[
            available_columns,
            dependent_var,
            endogenous_vars,
            instruments,
            exogenous_vars,
        ],
    )

    # The button passes the four role selections to the estimator.
    run_button.click(
        run_2sls,
        inputs=[dependent_var, endogenous_vars, instruments, exogenous_vars],
        outputs=output,
    )

app.launch()