2sls / app_archive5.py
pvaluedotone's picture
Rename app.py to app_archive5.py
e122b1e verified
import gradio as gr
import pandas as pd
import statsmodels.formula.api as smf
from linearmodels.iv import IV2SLS
from scipy import stats
import warnings
# Silence FutureWarning noise from the numerical libraries in the app log.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Dataset shared by the Gradio callbacks below. A bare module-level
# ``global df`` statement binds nothing, so the name is initialised
# explicitly; ``process_file`` rebinds it when a CSV is uploaded.
df = None
def process_file(file):
    """Load the uploaded CSV into the module-level ``df`` and refresh the dropdowns.

    Args:
        file: Value delivered by the upload component. Either an object with
            a ``name`` attribute holding the path, a plain path string, or
            ``None``.

    Returns:
        A 5-tuple of ``gr.update`` objects, in the order the caller binds
        them: the "available columns" selector (all columns listed and
        selected) followed by four selectors that only receive the column
        list as choices. When ``file`` is ``None`` every selector is emptied
        and its value cleared.
    """
    global df

    if file is None:
        # NOTE(review): presumably what the File component sends when the
        # upload is cleared -- confirm against the installed Gradio version.
        # Drop the stale dataset so later callbacks cannot run on it.
        df = pd.DataFrame()
        return (
            gr.update(choices=[], value=[]),
            gr.update(choices=[], value=None),  # single-select dropdown
            gr.update(choices=[], value=[]),
            gr.update(choices=[], value=[]),
            gr.update(choices=[], value=[]),
        )

    # Accept both file-like wrappers (``.name``) and plain path strings.
    path = getattr(file, "name", file)
    df = pd.read_csv(path)
    columns = df.columns.tolist()

    return (
        gr.update(choices=columns, value=columns),
        gr.update(choices=columns),
        gr.update(choices=columns),
        gr.update(choices=columns),
        gr.update(choices=columns),
    )
def update_dropdowns(dependent_var, endogenous_vars, instruments, exogenous_vars):
    """Return dropdown updates offering only the columns not yet assigned a role.

    Args:
        dependent_var: Selected dependent column name, or a falsy value.
        endogenous_vars: Selected endogenous column names (list or ``None``).
        instruments: Selected instrument column names (list or ``None``).
        exogenous_vars: Selected exogenous column names (list or ``None``).

    Returns:
        A 4-tuple of ``gr.update`` objects, each carrying the same list of
        still-unused columns as ``choices``.

    NOTE(review): the change handlers registered in this file each bind three
    outputs while four updates are returned here; the return shape is kept
    unchanged for compatibility.
    """
    # A multiselect may hand over None instead of an empty list; treat both
    # as "nothing selected" rather than failing on list concatenation.
    selections = [
        dependent_var,
        *(endogenous_vars or []),
        *(instruments or []),
        *(exogenous_vars or []),
    ]
    used_columns = {name for name in selections if name}

    # ``df`` may be unset (or None) before the first upload.
    data = globals().get("df")
    all_columns = [] if data is None else list(data.columns)

    available_choices = [col for col in all_columns if col not in used_columns]
    return (
        gr.update(choices=available_choices),
        gr.update(choices=available_choices),
        gr.update(choices=available_choices),
        gr.update(choices=available_choices),
    )
def run_2sls(dependent_var, endogenous_vars, instruments, exogenous_vars):
    """Fit a two-stage least squares model on the uploaded data and return a text report.

    The report contains the second-stage OLS summary, an endogeneity
    ("Hausman") check, the first-stage F-statistic for every endogenous
    variable, and a Sargan overidentification test.

    Args:
        dependent_var: Name of the dependent column.
        endogenous_vars: Endogenous column name(s); a list or a single name.
        instruments: Instrument column name(s); a list or a single name.
        exogenous_vars: Optional exogenous column names (list, empty or None).

    Returns:
        The formatted report, or a string starting with ``"Error: "`` when
        the selection is incomplete, no data is loaded, or estimation fails.
    """
    if not all([dependent_var, endogenous_vars, instruments]):
        return "Error: Please select all required variables."

    endogenous_vars = list(endogenous_vars) if isinstance(endogenous_vars, list) else [endogenous_vars]
    instruments = list(instruments) if isinstance(instruments, list) else [instruments]
    exogenous_vars = list(exogenous_vars) if exogenous_vars else []

    if len(instruments) < len(endogenous_vars):
        return "Error: The number of instruments must be at least equal to the number of endogenous variables."

    # ``df`` is only bound once a file has been uploaded; report that
    # clearly instead of surfacing a NameError message.
    data = globals().get("df")
    if data is None or data.empty:
        return "Error: No data loaded. Please upload a CSV file first."

    try:
        columns = [dependent_var] + endogenous_vars + instruments + exogenous_vars
        # Work on an independent copy: new columns are added below.
        df_selected = data[columns].dropna().copy()

        # First stage: regress each endogenous variable on all instruments
        # and exogenous variables. The fit supplies both the fitted values
        # for the second stage and the F-statistic for the weak-instrument
        # report, so each regression is estimated once.
        first_stage_rhs = ' + '.join(instruments + exogenous_vars)
        predicted_vars = []
        weak_instrument_lines = []
        for var in endogenous_vars:
            first_stage = smf.ols(f'{var} ~ {first_stage_rhs}', data=df_selected).fit()
            df_selected[f'{var}_hat'] = first_stage.fittedvalues
            predicted_vars.append(f'{var}_hat')
            weak_instrument_lines.append(
                f"\nWeak instrument test for {var}: First-stage F-statistic = {first_stage.fvalue:.5f}"
            )

        # Second stage: OLS of the dependent variable on the fitted values.
        # NOTE(review): the standard errors in this summary come from a plain
        # OLS on generated regressors, not from a 2SLS covariance estimator;
        # confirm whether the IV2SLS summary should be reported instead.
        second_stage_formula = f'{dependent_var} ~ ' + ' + '.join(predicted_vars + exogenous_vars)
        second_stage = smf.ols(second_stage_formula, data=df_selected).fit()

        # Endogeneity check: regress the OLS residuals on the first-stage
        # fitted values and report the joint F-test of that regression.
        # The statistic and the p-value now come from the same test (the
        # previous code paired the F-statistic with the t-test p-value of
        # the first regressor only).
        # NOTE(review): this auxiliary regression is kept as found; it is not
        # the textbook Durbin-Wu-Hausman specification -- confirm intent.
        ols_formula = f'{dependent_var} ~ ' + ' + '.join(endogenous_vars + exogenous_vars)
        ols_model = smf.ols(ols_formula, data=df_selected).fit()
        endogeneity_test_formula = 'residuals ~ ' + ' + '.join(predicted_vars)
        endogeneity_test = smf.ols(
            endogeneity_test_formula,
            data=df_selected.assign(residuals=ols_model.resid),
        ).fit()
        hausman_stat = endogeneity_test.fvalue
        hausman_p_value = endogeneity_test.f_pvalue

        # Sargan test: n * R^2 from regressing the IV residuals on all
        # instruments and exogenous variables. Only defined when there are
        # more instruments than endogenous variables.
        overid_df = len(instruments) - len(endogenous_vars)
        if overid_df > 0:
            # Build the exogenous part from a list so that an empty
            # selection yields "1" rather than a dangling "+".
            exog_terms = ' + '.join(['1'] + exogenous_vars)
            endog_terms = ' + '.join(endogenous_vars)
            instr_terms = ' + '.join(instruments)
            formula = f'{dependent_var} ~ {exog_terms} + [{endog_terms} ~ {instr_terms}]'
            iv_model = IV2SLS.from_formula(formula, data=df_selected).fit()
            df_selected['iv_resid'] = iv_model.resids
            sargan_test = smf.ols('iv_resid ~ ' + first_stage_rhs, data=df_selected).fit()
            sargan_stat = len(df_selected) * sargan_test.rsquared
            sargan_p_value = stats.chi2.sf(sargan_stat, df=overid_df)
            sargan_line = f"Sargan statistic = {sargan_stat:.5f}, p-value = {sargan_p_value:.5f}"
        else:
            sargan_line = "Not applicable: the model is exactly identified."

        report_parts = [
            f"2SLS regression results\n{second_stage.summary()}\n\n",
            "Hausman test \nNull hypothesis: OLS estimates are consistent\n",
            f"F-statistic = {hausman_stat:.5f}, p-value = {hausman_p_value:.5f}\n",
            "".join(weak_instrument_lines) + "\n",
            f"\nSargan test\nNull hypothesis: all instruments are valid\n{sargan_line}\n\n",
            "Sargan test (overidentification test) is only applicable when the "
            "model is overidentified (IVs > Endogenous X's).",
        ]
        return "".join(report_parts)
    except Exception as e:
        # UI boundary: show the failure in the output box instead of crashing
        # the callback.
        return f"Error: {str(e)}"
# Gradio front end: upload a CSV, assign columns to model roles, run the fit.
# NOTE(review): the original indentation was lost; the nesting below (button
# and output box placed after the row) is a reconstruction -- confirm layout.
with gr.Blocks() as app:
    gr.Markdown("## Two-Stage Least Squares Regression (2SLS)")
    file_input = gr.File(label="Upload CSV File")

    with gr.Row():
        with gr.Column():
            available_columns = gr.Dropdown(label="Available Columns", choices=[], multiselect=True)
        with gr.Column():
            dependent_var = gr.Dropdown(label="Dependent Variable", choices=[])
            endogenous_vars = gr.Dropdown(label="Endogenous Variables", choices=[], multiselect=True)
            instruments = gr.Dropdown(label="Instruments", choices=[], multiselect=True)
            exogenous_vars = gr.Dropdown(label="Exogenous Variables (optional)", choices=[], multiselect=True)

    run_button = gr.Button("Run 2SLS Regression")
    output = gr.Textbox(label="Regression Output", lines=20)

    # The four role selectors, in the argument order the callbacks expect.
    role_selectors = [dependent_var, endogenous_vars, instruments, exogenous_vars]

    # A new upload repopulates the column overview and every role selector.
    file_input.change(
        process_file,
        inputs=file_input,
        outputs=[available_columns, *role_selectors],
    )

    # Changing one selector refreshes the choices of the other three.
    for changed in role_selectors:
        changed.change(
            update_dropdowns,
            inputs=role_selectors,
            outputs=[other for other in role_selectors if other is not changed],
        )

    run_button.click(run_2sls, inputs=role_selectors, outputs=output)

app.launch()