"""
# mmregression
# Start 4 August 2024
# Rev 6 April 2025
"""
import arcpy
import pandas
import scipy.stats
from arcgis.features import GeoAccessor, GeoSeriesAccessor
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
def _neighbor_mean(features, neighbors, value_name):
    """Return the mean of *value_name* over each feature's neighbors.

    Parameters:
        features: data frame with a unique 'FEATUREID' column and a
            *value_name* column.
        neighbors: near table with 'IN_FID' and 'NEAR_FID' columns.
        value_name: name of the column in *features* to average.

    Returns:
        A Series aligned with the rows of *features*.  A feature is NaN
        when it has no rows in *neighbors* or when none of its neighbors
        is present in *features*.
    """
    # De-duplicate so a repeated (IN_FID, NEAR_FID) pair counts once.
    pairs = neighbors[['IN_FID', 'NEAR_FID']].astype('int64').drop_duplicates()
    values = features.set_index('FEATUREID')[value_name]
    # Neighbors that are absent from features map to NaN and are skipped
    # by the group mean.
    neighbor_values = pairs['NEAR_FID'].map(values)
    means = neighbor_values.groupby(pairs['IN_FID']).mean()
    return features['FEATUREID'].map(means).astype(float)


def _coefficient_table(model):
    """Tabulate rounded coefficient statistics, leaving out the last term.

    The last term is the spatial term (LAGVAL / LAGERR), which the caller
    reports separately as Rho / Lambda.
    """
    listing = pandas.DataFrame()
    listing["coef"] = model.params.round(3)
    listing["std err"] = model.bse.round(3)
    listing["t"] = model.tvalues.round(3)
    listing["P>|t|"] = model.pvalues.round(4)
    return listing.iloc[:-1, :]


def mmregression(input_features, output_features, dependent_name,
                 independent_names, standardize=True,
                 transform=True, k_neighbors=5):
    """Fit OLS, spatial lag and spatial error regressions on a feature class.

    Rows with missing values in any model variable are dropped.  The
    variables are optionally Box-Cox transformed and standardized, an OLS
    model is fitted, and (when k_neighbors > 0) two further OLS models are
    fitted that add the neighbor mean of the dependent variable (LAGVAL)
    or the neighbor mean of the OLS residuals (LAGERR) as a regressor.
    The working data frame, including residual and predicted columns, is
    written to *output_features*.

    Parameters:
        input_features: feature class to read.
        output_features: feature class to write (overwritten).
        dependent_name: name of the dependent variable field.
        independent_names: list of at least two independent field names.
        standardize: if true, convert each variable to z-scores.
        transform: if true, shift each variable so its minimum is 0.1 and
            apply a Box-Cox transformation.
        k_neighbors: number of nearest neighbors for the spatial models;
            zero or less runs OLS only.

    Returns:
        A text report: the VIF table, the OLS summary and, when neighbors
        are used, the spatial lag and spatial error listings.

    Raises:
        arcpy.ExecuteError: if the dependent variable is also listed as
            independent, fewer than two independent variables are given,
            or fewer than 2 * k_neighbors complete features remain.
    """
    # Error checks
    if dependent_name in independent_names:
        arcpy.AddError("All independent variables should be different from the dependent variable.")
        raise arcpy.ExecuteError

    if len(independent_names) < 2:
        arcpy.AddError("Supply at least two independent variables.")
        raise arcpy.ExecuteError

    # Tolerate a missing (None) or non-integer neighbor count; None means
    # "no neighbors".
    k_neighbors = int(k_neighbors or 0)

    # Import feature class into spatial data frame
    # https://developers.arcgis.com/python/guide/introduction-to-the-spatially-enabled-dataframe/
    arcpy.AddMessage("mmregression v2024.04.06 (c) 2024-2025 by Michael Minn")
    arcpy.AddMessage("You can redistribute or modify under the terms of")
    arcpy.AddMessage("version 2 of the GNU General Public License (GPL v2)\n\n")
    arcpy.AddMessage("Loading feature class")

    features = pandas.DataFrame.spatial.from_featureclass(input_features)
    # NOTE(review): FEATUREID is numbered 1..n in read order and is later
    # matched against IN_FID / NEAR_FID of the near table; this presumably
    # relies on the input object IDs being sequential from 1 - confirm.
    features.insert(0, 'FEATUREID', range(1, len(features) + 1))
    variable_names = [dependent_name] + independent_names
    # Build a fresh frame (not an in-place edit of a slice) with a clean
    # 0..n-1 index.
    features = features[['FEATUREID'] + variable_names + ["SHAPE"]] \
        .dropna().reset_index(drop=True)
    for x in variable_names:
        features[x] = features[x].astype(float)

    if len(features) < (k_neighbors * 2):
        arcpy.AddError("There must be at least 2 * k features (n=" +
                       str(len(features)) + ", k=" + str(k_neighbors) + ")")
        raise arcpy.ExecuteError

    # Transform
    arcpy.AddMessage("Transform / standardize")

    if transform:
        for x in variable_names:
            # Box-Cox needs strictly positive data: shift the minimum to 0.1
            features[x] = 0.1 - features[x].min() + features[x]
            features[x] = scipy.stats.boxcox(list(features[x]))[0]

    # Standardize
    if standardize:
        for x in variable_names:
            features[x] = (features[x] - features[x].mean()) / features[x].std()

    # If k is specified, create long array of nearest neighbors for each feature
    neighbors = None
    if k_neighbors <= 0:
        arcpy.AddMessage("No neighbors specified: OLS regression only")
    else:
        arcpy.AddMessage("Calculating nearest neighbors (" + str(k_neighbors) + ")")
        # NOTE(review): this table is left behind in the current workspace
        # after the run - consider deleting it once it has been loaded.
        neighbor_table_name = "mmregression_neighbors"
        arcpy.analysis.GenerateNearTable(
            in_features=input_features,
            near_features=[input_features],
            out_table=neighbor_table_name,
            search_radius="1000 Kilometers",
            closest="ALL", closest_count=k_neighbors,
            distance_unit="Meters")
        neighbors = pandas.DataFrame.spatial.from_table(neighbor_table_name)

        # Lagged dependent variable
        arcpy.AddMessage("Calculating lagged dependent variable")
        features['LAGVAL'] = _neighbor_mean(features, neighbors, dependent_name)

    # Multicollinearity
    vif = pandas.DataFrame()
    # Pass a plain ndarray: the positional column indexing inside
    # variance_inflation_factor does not depend on DataFrame support.
    vif_data = features[independent_names].to_numpy()
    vif['VIF'] = [variance_inflation_factor(vif_data, i)
                  for i in range(len(independent_names))]
    vif['Variable'] = independent_names
    output = str(vif)

    # OLS regression
    arcpy.AddMessage("Running regression")

    formula = dependent_name + ' ~ ' + '+'.join(independent_names)
    ols_model = smf.ols(formula, data=features[variable_names]).fit()
    features['olsresiduals'] = ols_model.resid
    # Residual = observed - predicted, so the prediction is the fitted
    # value (observed - residual), not observed + residual.
    features['olspredicted'] = ols_model.fittedvalues
    output = output + '\n\n========= OLS =========\n\n' + str(ols_model.summary())

    if k_neighbors > 0:
        # Spatial lag regression
        formula = dependent_name + ' ~ ' + '+'.join(independent_names) + "+LAGVAL"
        lag_names = variable_names + ['LAGVAL']
        lag_model = smf.ols(formula, data=features[lag_names]).fit()
        features['lagresiduals'] = lag_model.resid
        features['lagpredicted'] = lag_model.fittedvalues
        listing = _coefficient_table(lag_model)
        output = output + '\n\n========= Spatial Lag =========\n\n' + listing.to_string()
        output = output + "\n\nAIC: " + str(lag_model.aic.round(4))
        output = output + "\nRho: " + str(lag_model.params['LAGVAL'].round(4))
        # NOTE(review): this reports the plain OLS model's adjusted
        # R-squared, not the lag model's - confirm that is intended.
        output = output + "\nOLS Adj R-squared: " + str(ols_model.rsquared_adj.round(3))

        # Spatial error regression uses lagged residuals from OLS.
        # NOTE(review): the original author was not sure this approach is right.
        arcpy.AddMessage("Calculating lagged residuals")
        features['LAGERR'] = _neighbor_mean(features, neighbors, 'olsresiduals')

        formula = dependent_name + ' ~ ' + '+'.join(independent_names) + "+LAGERR"
        err_names = variable_names + ['LAGERR']
        err_model = smf.ols(formula, data=features[err_names]).fit()
        features['errresiduals'] = err_model.resid
        listing = _coefficient_table(err_model)
        output = output + '\n\n========= Spatial Error =========\n\n' + listing.to_string()
        output = output + "\n\nAIC:    " + str(err_model.aic.round(4))
        output = output + "\nLambda: " + str(err_model.params['LAGERR'].round(4))
        # NOTE(review): as in the lag section, this is the plain OLS
        # model's adjusted R-squared.
        output = output + "\nOLS Adj R-squared: " + str(ols_model.rsquared_adj.round(3))

    # Write output
    arcpy.AddMessage("Writing output feature class")
    features.spatial.to_featureclass(location=output_features, overwrite=True)

    return output
if __name__ == "__main__":
    # Script-tool entry point: read the tool parameters by position, run
    # the regression, report the results and show the output layer.
    input_features = arcpy.GetParameterAsText(0)
    output_features = arcpy.GetParameterAsText(1)
    dependent_name = arcpy.GetParameterAsText(2)
    # Multi-value parameters arrive as one semicolon-delimited string
    independent_names = arcpy.GetParameterAsText(3).split(';')
    standardize = arcpy.GetParameter(4)
    transform = arcpy.GetParameter(5)
    k_neighbors = arcpy.GetParameter(6)
    results = mmregression(input_features, output_features, dependent_name,
                           independent_names, standardize,
                           transform, k_neighbors)
    arcpy.AddMessage(results)
    aprx = arcpy.mp.ArcGISProject('CURRENT')
    # activeMap is None when no map view has focus; the analysis has
    # already succeeded at this point, so warn instead of crashing.
    active_map = aprx.activeMap
    if active_map is None:
        arcpy.AddWarning("No active map: the output feature class was not added to a map.")
    else:
        active_map.addDataFromPath(output_features)
    
    # arcpy.SetParameterAsText(7, results)
