County Competition Review

# Dependencies:
# - pandas: For data manipulation and analysis
# - openpyxl: For creating and editing Excel files with pandas

# Instructions for installing dependencies:
# 1. Install pandas:
#    pip install pandas
#
# 2. Install openpyxl:
#    pip install openpyxl

import pandas as pd
import os

# Define the path to the directory containing county CSV files on the network drive
directory = r'\\fs\Engineering\Cost Quest Raw Data\V7\Merged CSV Files for Import\county_data'

# Define the path to the analysis directory
analysis_directory = os.path.join(directory, 'analysis')

# FIPS to County mapping
fips_to_county = {
    '18001': 'Adams', '18003': 'Allen', '18005': 'Bartholomew', '18007': 'Benton', '18009': 'Blackford', 
    '18011': 'Boone', '18013': 'Brown', '18015': 'Carroll', '18017': 'Cass', '18019': 'Clark', 
    '18021': 'Clay', '18023': 'Clinton', '18025': 'Crawford', '18027': 'Daviess', '18029': 'Dearborn', 
    '18031': 'Decatur', '18033': 'De Kalb', '18035': 'Delaware', '18037': 'Dubois', '18039': 'Elkhart', 
    '18041': 'Fayette', '18043': 'Floyd', '18045': 'Fountain', '18047': 'Franklin', '18049': 'Fulton', 
    '18051': 'Gibson', '18053': 'Grant', '18055': 'Greene', '18057': 'Hamilton', '18059': 'Hancock', 
    '18061': 'Harrison', '18063': 'Hendricks', '18065': 'Henry', '18067': 'Howard', '18069': 'Huntington', 
    '18071': 'Jackson', '18073': 'Jasper', '18075': 'Jay', '18077': 'Jefferson', '18079': 'Jennings', 
    '18081': 'Johnson', '18083': 'Knox', '18085': 'Kosciusko', '18087': 'Lagrange', '18089': 'Lake', 
    '18091': 'La Porte', '18093': 'Lawrence', '18095': 'Madison', '18097': 'Marion', '18099': 'Marshall', 
    '18101': 'Martin', '18103': 'Miami', '18105': 'Monroe', '18107': 'Montgomery', '18109': 'Morgan', 
    '18111': 'Newton', '18113': 'Noble', '18115': 'Ohio', '18117': 'Orange', '18119': 'Owen', 
    '18121': 'Parke', '18123': 'Perry', '18125': 'Pike', '18127': 'Porter', '18129': 'Posey', 
    '18131': 'Pulaski', '18133': 'Putnam', '18135': 'Randolph', '18137': 'Ripley', '18139': 'Rush', 
    '18141': 'St. Joseph', '18143': 'Scott', '18145': 'Shelby', '18147': 'Spencer', '18149': 'Starke', 
    '18151': 'Steuben', '18153': 'Sullivan', '18155': 'Switzerland', '18157': 'Tippecanoe', '18159': 'Tipton', 
    '18161': 'Union', '18163': 'Vanderburgh', '18165': 'Vermillion', '18167': 'Vigo', '18169': 'Wabash', 
    '18171': 'Warren', '18173': 'Warrick', '18175': 'Washington', '18177': 'Wayne', '18179': 'Wells', 
    '18181': 'White', '18183': 'Whitley'
}

# Define the counties to process first
initial_counties = ['18103', '18061', '18153', '18139', '18021', '18015', '18117', '18171', '18073', '18051']

# Function to process a single file and return the results as a DataFrame
def process_county_file(filepath, county_name):
    # Load the data for the specified county with specified dtype for problematic columns
    data = pd.read_csv(filepath, dtype={'BDC_SatelliteProviders': str}, low_memory=False)
    
    # Perform the analysis for the specified county
    total_passings = data['Location_id'].nunique()
    
    # Define a function to calculate passings and market share for a provider type
    def calculate_market_share(provider_column):
        provider_counts = {}
        for entry in data[provider_column].dropna():
            providers = entry.split('|')
            for provider in providers:
                provider = provider.strip()
                if provider in provider_counts:
                    provider_counts[provider] += 1
                else:
                    provider_counts[provider] = 1
        
        provider_details = []
        for provider, count in provider_counts.items():
            if count >= 10:
                market_share = (count / total_passings) * 100
                if market_share >= 1:
                    provider_details.append((provider, count, market_share))
        return provider_details
    
    # Get the details for each provider type
    fiber_providers = calculate_market_share('BDC_FiberProviders')
    cable_providers = calculate_market_share('BDC_CableProviders')
    fw_unlicensed_providers = calculate_market_share('BDC_FWUnlicensedProviders')
    telco_providers = calculate_market_share('BDC_TelcoProviders')
    
    # Store the results in a structured format
    results = []
    results.append(('Total Passings', total_passings, ''))

    results.append(('Fiber Providers', 'Passings', 'Market Share (%)'))
    for provider, passings, market_share in fiber_providers:
        results.append((provider, passings, f"{market_share:.2f}%"))

    results.append(('Cable Providers', 'Passings', 'Market Share (%)'))
    for provider, passings, market_share in cable_providers:
        results.append((provider, passings, f"{market_share:.2f}%"))

    results.append(('Fixed Wireless Providers', 'Passings', 'Market Share (%)'))
    for provider, passings, market_share in fw_unlicensed_providers:
        results.append((provider, passings, f"{market_share:.2f}%"))

    results.append(('Telco Providers', 'Passings', 'Market Share (%)'))
    for provider, passings, market_share in telco_providers:
        results.append((provider, passings, f"{market_share:.2f}%"))
    
    # Convert the results list to a DataFrame
    report_df = pd.DataFrame(results, columns=['Provider', 'Passings', 'Percent of Market'])
    return report_df

# Create an Excel writer object
workbook_file = os.path.join(analysis_directory, 'county_detailed_competition_report.xlsx')
with pd.ExcelWriter(workbook_file, engine='openpyxl') as writer:
    # Process the initial counties first
    for fips in initial_counties:
        county_name = fips_to_county[fips]
        filename = f'county_{fips}.csv'
        filepath = os.path.join(directory, filename)
        if os.path.exists(filepath):
            print(f"Processing file: {filename} for county: {county_name}")
            county_df = process_county_file(filepath, county_name)
            # Add the DataFrame to the workbook
            county_df.to_excel(writer, sheet_name=county_name, index=False)

    # Process all other counties
    for fips, county_name in fips_to_county.items():
        if fips not in initial_counties:
            filename = f'county_{fips}.csv'
            filepath = os.path.join(directory, filename)
            if os.path.exists(filepath):
                print(f"Processing file: {filename} for county: {county_name}")
                county_df = process_county_file(filepath, county_name)
                # Add the DataFrame to the workbook
                county_df.to_excel(writer, sheet_name=county_name, index=False)

print(f"Workbook saved to {workbook_file}")
print("Processing complete.")