"""Gradio app that runs a two-stage least squares (2SLS) regression on an uploaded CSV.

The user uploads a CSV file, assigns columns to the dependent / endogenous /
instrument / exogenous roles, and receives the second-stage regression summary
together with endogeneity, weak-instrument and over-identification diagnostics.
"""

import warnings

import gradio as gr
import pandas as pd
import statsmodels.formula.api as smf
from linearmodels.iv import IV2SLS
from scipy import stats

warnings.simplefilter(action='ignore', category=FutureWarning)

# Data frame written by process_file() and read by run_2sls().
# Stays None until a CSV has been uploaded successfully.
df = None


def _as_list(selection):
    """Normalise a dropdown selection (None, a single name, or a sequence) to a list."""
    if not selection:
        return []
    if isinstance(selection, (list, tuple)):
        return list(selection)
    # A bare string must be wrapped, not iterated character by character.
    return [selection]


def process_file(file):
    """Load the uploaded CSV into the module-level ``df`` and refresh the dropdowns.

    Args:
        file: Value delivered by the ``gr.File`` component: an object exposing
            the temporary path as ``.name``, a plain path string, or ``None``
            when the upload has been cleared.

    Returns:
        A 5-tuple of ``gr.update`` objects for, in order: available columns,
        dependent variable, endogenous variables, instruments, exogenous
        variables.
    """
    global df

    if file is None:
        # The file widget was cleared: forget the data and empty every dropdown.
        df = None
        return (
            gr.update(choices=[], value=[]),
            gr.update(choices=[], value=None),
            gr.update(choices=[], value=[]),
            gr.update(choices=[], value=[]),
            gr.update(choices=[], value=[]),
        )

    # Accept both file-like objects carrying ``.name`` and bare path strings.
    path = getattr(file, "name", file)
    df = pd.read_csv(path)
    columns = df.columns.tolist()
    return (
        gr.update(choices=columns, value=columns),
        gr.update(choices=columns),
        gr.update(choices=columns),
        gr.update(choices=columns),
        gr.update(choices=columns),
    )


def run_2sls(dependent_var, endogenous_vars, instruments, exogenous_vars):
    """Run a 2SLS regression plus diagnostics on the uploaded data.

    Args:
        dependent_var: Name of the outcome column.
        endogenous_vars: Name(s) of the endogenous regressors.
        instruments: Name(s) of the excluded instruments.
        exogenous_vars: Name(s) of the exogenous controls; may be empty/None.

    Returns:
        A text report (second-stage summary, endogeneity test, first-stage
        F-statistics, Sargan test), or a string starting with ``"Error:"`` when
        the inputs are invalid or estimation fails.
    """
    if not all([dependent_var, endogenous_vars, instruments]):
        return "Error: Please select all required variables."

    endogenous_vars = _as_list(endogenous_vars)
    instruments = _as_list(instruments)
    exogenous_vars = _as_list(exogenous_vars)

    if len(instruments) < len(endogenous_vars):
        return "Error: The number of instruments must be at least equal to the number of endogenous variables."

    if df is None:
        return "Error: Please upload a CSV file first."

    selected = [dependent_var] + endogenous_vars + instruments + exogenous_vars
    if len(set(selected)) != len(selected):
        # Duplicates would create repeated columns in the working frame.
        return "Error: Each variable can only be assigned to one role."

    try:
        df_selected = df[selected].dropna()

        # --- First stage: regress each endogenous variable on instruments + controls.
        first_stage_rhs = ' + '.join(instruments + exogenous_vars)
        # Restricted first stage (controls only) for the partial F-test.
        restricted_rhs = ' + '.join(exogenous_vars) if exogenous_vars else '1'

        predicted_vars = []
        weak_instrument_lines = []
        for var in endogenous_vars:
            first_stage = smf.ols(f'{var} ~ {first_stage_rhs}', data=df_selected).fit()
            df_selected[f'{var}_hat'] = first_stage.fittedvalues
            predicted_vars.append(f'{var}_hat')

            if exogenous_vars:
                # Instrument strength is the joint significance of the excluded
                # instruments only; the overall F would also credit the controls.
                restricted = smf.ols(f'{var} ~ {restricted_rhs}', data=df_selected).fit()
                f_stat = first_stage.compare_f_test(restricted)[0]
            else:
                # Without controls the overall F already tests only the instruments.
                f_stat = first_stage.fvalue
            weak_instrument_lines.append(
                f"\nWeak instrument test for {var}: First-stage F-statistic = {f_stat:.5f}"
            )
        weak_instrument_results = "".join(weak_instrument_lines)

        # --- Second stage: outcome on fitted endogenous values + controls.
        # NOTE(review): this is plain OLS on generated regressors, so the
        # standard errors in its summary are not the 2SLS-corrected ones; the
        # IV2SLS fit further below provides those if they are needed.
        second_stage_formula = f'{dependent_var} ~ ' + ' + '.join(predicted_vars + exogenous_vars)
        second_stage = smf.ols(second_stage_formula, data=df_selected).fit()

        # --- Endogeneity ("Hausman") test: OLS residuals regressed on the
        # first-stage fitted values; joint F-test of those regressors.
        ols_formula = f'{dependent_var} ~ ' + ' + '.join(endogenous_vars + exogenous_vars)
        ols_model = smf.ols(ols_formula, data=df_selected).fit()
        endogeneity_test_formula = 'residuals ~ ' + ' + '.join(predicted_vars)
        endogeneity_test = smf.ols(
            endogeneity_test_formula,
            data=df_selected.assign(residuals=ols_model.resid),
        ).fit()
        hausman_stat = endogeneity_test.fvalue
        # Use the p-value that belongs to the reported F statistic, so that all
        # endogenous variables are covered rather than only the first one.
        hausman_p_value = endogeneity_test.f_pvalue
        hausman_df_num = int(endogeneity_test.df_model)
        hausman_df_den = int(endogeneity_test.df_resid)

        # --- Sargan over-identification test.
        overid_df = len(instruments) - len(endogenous_vars)
        if overid_df > 0:
            # Build the exogenous part from a term list so that an empty
            # control set cannot leave a dangling '+' in the formula.
            exog_terms = ' + '.join(['1'] + exogenous_vars)
            formula = (
                f'{dependent_var} ~ {exog_terms} + ['
                + ' + '.join(endogenous_vars)
                + ' ~ '
                + ' + '.join(instruments)
                + ']'
            )
            iv_model = IV2SLS.from_formula(formula, data=df_selected).fit()
            df_selected['iv_resid'] = iv_model.resids
            sargan_formula = 'iv_resid ~ ' + ' + '.join(instruments + exogenous_vars)
            sargan_test = smf.ols(sargan_formula, data=df_selected).fit()
            sargan_stat = len(df_selected) * sargan_test.rsquared
            sargan_p_value = stats.chi2.sf(sargan_stat, df=overid_df)
            sargan_line = f"Sargan statistic = {sargan_stat:.5f}, p-value = {sargan_p_value:.5f}"
        else:
            # Exactly identified: zero degrees of freedom, the test is undefined.
            sargan_line = "Sargan statistic and p-value: not applicable (model is exactly identified)"

        results = f"2SLS regression results:\n{second_stage.summary()}\n\n"
        results += (
            "Hausman test \nNull hypothesis: OLS estimates are consistent\n"
            f"F({hausman_df_num}, {hausman_df_den}) = {hausman_stat:.5f}, "
            f"p-value = {hausman_p_value:.5f}\n"
        )
        results += weak_instrument_results + "\n"
        results += (
            "\nSargan test\nNull hypothesis: all instruments are valid\n"
            f"{sargan_line}\n\n"
            "Sargan test (overidentification test) is only applicable when the "
            "model is overidentified (IVs > Endogenous X's)."
        )
        return results
    except Exception as e:
        # UI boundary: surface any estimation failure as text instead of crashing.
        return f"Error: {str(e)}"


with gr.Blocks() as app:
    gr.Markdown("## Two-Stage Least Squares Regression (2SLS)")
    file_input = gr.File(label="Upload CSV File")

    with gr.Row():
        with gr.Column():
            available_columns = gr.Dropdown(label="Available Columns", choices=[], multiselect=True)
        with gr.Column():
            dependent_var = gr.Dropdown(label="Dependent Variable", choices=[])
            endogenous_vars = gr.Dropdown(label="Endogenous Variables", choices=[], multiselect=True)
            instruments = gr.Dropdown(label="Instruments", choices=[], multiselect=True)
            exogenous_vars = gr.Dropdown(label="Exogenous Variables (optional)", choices=[], multiselect=True)

    run_button = gr.Button("Run 2SLS Regression")
    output = gr.Textbox(label="Regression Output", lines=20)

    # Uploading (or clearing) a file repopulates every column dropdown.
    file_input.change(
        process_file,
        inputs=file_input,
        outputs=[available_columns, dependent_var, endogenous_vars, instruments, exogenous_vars],
    )
    run_button.click(
        run_2sls,
        inputs=[dependent_var, endogenous_vars, instruments, exogenous_vars],
        outputs=output,
    )

app.launch()