In [2]:
#importing datasets function using pandas, parameters: string name of dataset, dataset should be .csv file  

import pandas as pd
def import_data(dataName):
    csv_path_full = dataName
    print (csv_path_full)
    return pd.read_csv(csv_path_full)
    
In [3]:
#calling function
econData = import_data("QOL_data_Economy.csv")
edData = import_data("QOL_data_education.csv")
tranData = import_data("QOL_data_transportation.csv")
engData = import_data("QOL_data_engagement.csv")
houseData = import_data("QOL_data_Housing.csv")

houseData.head()
QOL_data_Economy.csv
QOL_data_education.csv
QOL_data_transportation.csv
QOL_data_engagement.csv
QOL_data_Housing.csv
Out[3]:
NPA Housing_Density_2016 Housing_Units_2016 Housing_Density_2015 Housing_Units_2015 Housing_Density_2013 Housing_Units_2013 Housing_Density_2011 Housing_Units_2011 Single_Family_Housing_2016 ... Subsidized_Housing_2015 Subsidized_Housing_Units_2015 Subsidized_Housing_2013 Subsidized_Housing_Units_2013 Home_Ownership_2015 Home_Ownership_moe_2015 Residential_Occupancy_2014 Residential_Occupancy_moe_2014 Rental_Costs_2014 Rental_Costs_moe_2014
0 2 2.6 1079 2.6 1079 2.6 1081 2.6 1083 47.0 ... 7.0 75.0 7.0 80.0 35.0 5.0 94.0 5.0 754.0 NaN
1 3 4.1 4730 4.1 4752 4.1 4746 3.8 4369 38.0 ... 10.0 472.0 10.0 472.0 47.0 4.0 86.0 2.0 1059.0 NaN
2 4 1.2 406 1.2 404 1.2 402 1.2 401 89.0 ... 0.0 0.0 0.0 0.0 100.0 0.0 98.0 4.0 NaN NaN
3 5 2.0 326 2.0 331 2.0 329 2.0 332 59.0 ... 0.0 0.0 0.0 0.0 25.0 14.0 87.0 7.0 613.0 122.0
4 6 2.0 816 2.0 817 2.0 820 2.1 825 84.0 ... 0.0 0.0 0.0 0.0 28.0 11.0 83.0 5.0 743.0 NaN

5 rows × 83 columns

In [4]:
#econData.describe()
#econData.info()
#engData.info()
houseData.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Data columns (total 83 columns):
NPA                                         462 non-null int64
Housing_Density_2016                        462 non-null float64
Housing_Units_2016                          462 non-null int64
Housing_Density_2015                        462 non-null float64
Housing_Units_2015                          462 non-null int64
Housing_Density_2013                        462 non-null float64
Housing_Units_2013                          462 non-null int64
Housing_Density_2011                        462 non-null float64
Housing_Units_2011                          462 non-null int64
Single_Family_Housing_2016                  461 non-null float64
Single_Family_Units_2016                    461 non-null float64
Single_Family_Housing_2015                  461 non-null float64
Single_Family_Units_2015                    461 non-null float64
Single_Family_Housing_2013                  460 non-null float64
Single_Family_Units_2013                    460 non-null float64
Single_Family_Housing_2011                  461 non-null float64
Single_Family_Units_2011                    461 non-null float64
Housing_Size_2016                           442 non-null float64
Housing_Size_2015                           442 non-null float64
Housing_Size_2013                           443 non-null float64
Housing_Size_2011                           442 non-null float64
Housing_Age_2016                            442 non-null float64
Housing_Age_2015                            442 non-null float64
Housing_Age_2013                            443 non-null float64
Housing_Age_2011                            442 non-null float64
Rental_Houses_2016                          436 non-null float64
Rental_Houses_Units_2016                    436 non-null float64
Rental_Houses_2015                          437 non-null float64
Rental_Houses_Units_2015                    437 non-null float64
Rental_Houses_2013                          438 non-null float64
Rental_Houses_Units_2013                    438 non-null float64
New_Residential_2016                        462 non-null float64
New_Residential_Permit_Units_2016           462 non-null int64
New_Residential_2015                        462 non-null float64
New_Residential_Permit_Units_2015           462 non-null int64
New_Residential_2014                        462 non-null float64
New_Residential_Permit_Units_2014           462 non-null int64
New_Residential_2013                        462 non-null float64
New_Residential_Permit_Units_2013           462 non-null int64
New_Residential_2012                        462 non-null float64
New_Residential_Permit_Units_2012           462 non-null int64
New_Residential_2011                        462 non-null float64
New_Residential_Permit_Units_2011           462 non-null int64
Residential_Renovation_2016                 462 non-null float64
Residential_Renovation_Permit_Units_2016    462 non-null int64
Residential_Renovation_2015                 462 non-null float64
Residential_Renovation_Permit_Units_2015    462 non-null int64
Residential_Renovation_2014                 462 non-null float64
Residential_Renovation_Permit_Units_2014    462 non-null int64
Residential_Renovation_2013                 462 non-null float64
Residential_Renovation_Permit_Units_2013    462 non-null int64
Residential_Renovation_2012                 462 non-null float64
Residential_Renovation_Permit_Units_2012    462 non-null int64
Residential_Renovation_2011                 462 non-null float64
Residential_Renovation_Permit_Units_2011    462 non-null int64
Foreclosures_2016                           451 non-null float64
Foreclosed_Units_2016                       451 non-null float64
Foreclosures_2015                           451 non-null float64
Foreclosed_Units_2015                       451 non-null float64
Foreclosures_2013                           450 non-null float64
Foreclosed_Units_2013                       450 non-null float64
Foreclosures_2011                           451 non-null float64
Foreclosed_Units_2011                       451 non-null float64
Home_Sales_Price_2015                       412 non-null float64
Home_Sales_Price_2013                       416 non-null float64
Housing_Violations_2016                     377 non-null float64
Housing_Violations_Total_2016               377 non-null float64
Housing_Violations_2015                     377 non-null float64
Housing_Violations_Total_2015               377 non-null float64
Housing_Violations_2013                     377 non-null float64
Housing_Violations_Total_2013               377 non-null float64
Housing_Violations_2011                     377 non-null float64
Housing_Violations_Total_2011               377 non-null float64
Subsidized_Housing_2015                     461 non-null float64
Subsidized_Housing_Units_2015               461 non-null float64
Subsidized_Housing_2013                     461 non-null float64
Subsidized_Housing_Units_2013               461 non-null float64
Home_Ownership_2015                         459 non-null float64
Home_Ownership_moe_2015                     458 non-null float64
Residential_Occupancy_2014                  459 non-null float64
Residential_Occupancy_moe_2014              458 non-null float64
Rental_Costs_2014                           412 non-null float64
Rental_Costs_moe_2014                       353 non-null float64
dtypes: float64(66), int64(17)
memory usage: 299.7 KB
In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
tranData.hist(bins=50, figsize=(20,15))
plt.show()
In [22]:
totalSet["Board_Committee_Participation_2015"]
Out[22]:
0      0.00
1      3.28
2      3.71
3      0.00
4      0.00
5      0.76
6      0.00
7      1.65
8      1.07
9      2.29
10     0.00
11     0.84
12     0.53
13     1.40
14     0.00
15     0.00
16     1.46
17     0.55
18     0.00
19     0.00
20     4.16
21     0.00
22     0.00
23     0.68
24     0.32
25     0.00
26     3.34
27     0.00
28     0.96
29     1.80
       ... 
432    0.68
433    0.73
434    0.00
435    1.58
436    3.83
437    0.25
438    0.99
439    0.59
440    1.00
441    0.00
442    0.00
443    3.68
444    5.69
445    0.89
446    0.72
447    4.66
448    0.47
449    0.42
450    0.28
451    3.19
452    1.17
453    0.00
454    0.00
455    1.06
456    3.00
457    1.08
458    6.87
459    1.32
460    0.84
461    0.74
Name: Board_Committee_Participation_2015, Length: 462, dtype: float64
In [9]:
frames = [econData, edData, tranData, engData, houseData]
totalSet = pd.concat(frames, axis = 1)
totalSet.head()
Out[9]:
NPA Household_Income_2015 Household_Income_moe_2015 Public_Nutrition_Assistance_2015 Public_Nutrition_Assistance_2013 Public_Nutrition_Assistance_2011 Employment_Rate_2015 Employment_Rate_moe_2015 Job_Density_2014 Jobs_2014 ... Subsidized_Housing_2015 Subsidized_Housing_Units_2015 Subsidized_Housing_2013 Subsidized_Housing_Units_2013 Home_Ownership_2015 Home_Ownership_moe_2015 Residential_Occupancy_2014 Residential_Occupancy_moe_2014 Rental_Costs_2014 Rental_Costs_moe_2014
0 2 45373.0 NaN 17.0 23.0 26.0 87.0 8.0 2.5 1045 ... 7.0 75.0 7.0 80.0 35.0 5.0 94.0 5.0 754.0 NaN
1 3 81048.0 NaN 4.0 4.0 6.0 98.0 1.0 29.2 33769 ... 10.0 472.0 10.0 472.0 47.0 4.0 86.0 2.0 1059.0 NaN
2 4 151974.0 58585.0 0.0 1.0 1.0 100.0 0.0 0.2 54 ... 0.0 0.0 0.0 0.0 100.0 0.0 98.0 4.0 NaN NaN
3 5 20032.0 7523.0 56.0 54.0 61.0 82.0 11.0 0.8 129 ... 0.0 0.0 0.0 0.0 25.0 14.0 87.0 7.0 613.0 122.0
4 6 24092.0 NaN 54.0 46.0 54.0 84.0 5.0 2.2 903 ... 0.0 0.0 0.0 0.0 28.0 11.0 83.0 5.0 743.0 NaN

5 rows × 228 columns

In [21]:
list(totalSet)
Out[21]:
['NPA',
 'Household_Income_2015',
 'Household_Income_moe_2015',
 'Public_Nutrition_Assistance_2015',
 'Public_Nutrition_Assistance_2013',
 'Public_Nutrition_Assistance_2011',
 'Employment_Rate_2015',
 'Employment_Rate_moe_2015',
 'Job_Density_2014',
 'Jobs_2014',
 'Job_Density_2013',
 'Jobs_2013',
 'Job_Density_2012',
 'Jobs_2012',
 'Job_Density_2011',
 'Jobs_2011',
 'Job_Density_2010',
 'Jobs_2010',
 'Job_Density_2009',
 'Jobs_2009',
 'Job_Density_2008',
 'Jobs_2008',
 'Job_Density_2007',
 'Jobs_2007',
 'Job_Density_2006',
 'Jobs_2006',
 'Job_Density_2005',
 'Jobs_2005',
 'Job_Density_2004',
 'Jobs_2004',
 'Job_Density_2003',
 'Jobs_2003',
 'Job_Density_2002',
 'Jobs_2002',
 'Commercial_Construction_2016',
 'Commercial_Construction_Permitted_Units_2016',
 'Commercial_Construction_2015',
 'Commercial_Construction_Permitted_Units_2015',
 'Commercial_Construction_2014',
 'Commercial_Construction_Permitted_Units_2014',
 'Commercial_Construction_2013',
 'Commercial_Construction_Permitted_Units_2013',
 'Commercial_Construction_2012',
 'Commercial_Construction_Permitted_Units_2012',
 'Commercial_Construction_2011',
 'Commercial_Construction_Permitted_Units_2011',
 'Commercial_Size_2016',
 'Commercial_Size_Total_2016',
 'Commercial_Size_2015',
 'Commercial_Size_Total_2015',
 'Commercial_Size_2013',
 'Commercial_Size_Total_2013',
 'Commercial_Size_2011',
 'Commercial_Size_Total_2011',
 'Commercial_Building_Age_2016',
 'Commercial_Building_Age_2015',
 'Commercial_Building_Age_2013',
 'Commercial_Building_Age_11',
 'Fincancial_Services_Proximity_2016',
 'Financial_Services_Proximate_Units_2016',
 'Fincancial_Services_Proximity_2015',
 'Financial_Services_Proximate_Units_2015',
 'Fincancial_Services_Proximity_2014',
 'Financial_Services_Proximate_Units_2014',
 'NPA',
 'Bachelors_Degree_2015',
 'Bachelors_Degree_moe_2015',
 'High_School_Diploma_2015',
 'High_School_Diploma_moe_2015',
 'Early_Care_Proximity_2015',
 'Early_Care_Proximate_Units_2015',
 'Early_Care_Proximity_2013',
 'Early_Care_Proximate_Units_2013',
 'Early_Care_Proximity_2011',
 'Early_Care_Proximate_Units_2011',
 'SchoolAge_Proximity_2015',
 'SchoolAge_Proximate_Units_2015',
 'SchoolAge_Proximity_2013',
 'SchoolAge_Proximate_Units_2013',
 'SchoolAge_Proximity_2011',
 'SchoolAge_Proximate_Units_2011',
 'Library_Card_Prevalence_2015',
 'Library_Card_Holders_2015',
 'Library_Card_Prevalence_2013',
 'Library_Card_Holders_2013',
 'Proficiency_Elementary_School_2014',
 'Proficiency_Elementary_School_2013',
 'Proficiency_Middle_School_2014',
 'Proficiency_Middle_School_2013',
 'Proficiency_High_School_2014',
 'Proficiency_High_School_2013',
 'Highschool_Graduation_Rate_2014',
 'Highschool_Graduation_Rate_2013',
 'Student_Absenteeism_2014',
 'Student_Absenteeism_2013',
 'Neighborhood_School_Attendance_2014',
 'Neighborhood_School_Attendance_2013',
 'Neighborhood_School_Attendance_2011',
 'NPA',
 'Long_Commute_2014',
 'Long_Commute_moe_2014',
 'Bicycle_Friendliness_2015',
 'Bicycle_Friendliness_2013',
 'Bicycle_Friendliness_2011',
 'Street_Connectivity_2016',
 'Street_Connectivity_2015',
 'Street_Connectivity_2013',
 'Street_Connectivity_2011',
 'Sidewalk_Availability_2013',
 'Sidewalk_Miles_2013',
 'Transit_Proximity_2016',
 'Transit_Proximate_Units_2016',
 'Transit_Proximity_2015',
 'Transit_Proximate_Units_2015',
 'Transit_Proximity_2013',
 'Transit_Proximate_Units_2013',
 'Transit_Proximity_2011',
 'Transit_Proximate_Units_2011',
 'Transit_Ridership_2015',
 'Transit_Ridership_Total_2015',
 'Transit_Ridership_2014',
 'Transit_Ridership_Total_2014',
 'Transit_Ridership_2013',
 'Transit_Ridership_Total_2013',
 'NPA',
 'Arts_Participation_2013',
 'Arts_Participating_Households_2013',
 '311_Requests_2015',
 '311_Calls_2015',
 '311_Requests_2013',
 '311_Calls_2013',
 'Voter_Participation_2016',
 'Voters_Participating_2016',
 'Voter_Participation_2015',
 'Voters_Participating_2015',
 'Voter_Participation_2014',
 'Voters_Participating_2014',
 'Voter_Participation_2012',
 'Voters_Participating_2012',
 'Voter_Participation_2010',
 'Voters_Participating_2010',
 'Board_Committee_Participation_2015',
 'Board_Committee_Participants_2015',
 'Neighborhood_Organizations_2015',
 'Neighborhood_Organizations_2014',
 'NPA',
 'Housing_Density_2016',
 'Housing_Units_2016',
 'Housing_Density_2015',
 'Housing_Units_2015',
 'Housing_Density_2013',
 'Housing_Units_2013',
 'Housing_Density_2011',
 'Housing_Units_2011',
 'Single_Family_Housing_2016',
 'Single_Family_Units_2016',
 'Single_Family_Housing_2015',
 'Single_Family_Units_2015',
 'Single_Family_Housing_2013',
 'Single_Family_Units_2013',
 'Single_Family_Housing_2011',
 'Single_Family_Units_2011',
 'Housing_Size_2016',
 'Housing_Size_2015',
 'Housing_Size_2013',
 'Housing_Size_2011',
 'Housing_Age_2016',
 'Housing_Age_2015',
 'Housing_Age_2013',
 'Housing_Age_2011',
 'Rental_Houses_2016',
 'Rental_Houses_Units_2016',
 'Rental_Houses_2015',
 'Rental_Houses_Units_2015',
 'Rental_Houses_2013',
 'Rental_Houses_Units_2013',
 'New_Residential_2016',
 'New_Residential_Permit_Units_2016',
 'New_Residential_2015',
 'New_Residential_Permit_Units_2015',
 'New_Residential_2014',
 'New_Residential_Permit_Units_2014',
 'New_Residential_2013',
 'New_Residential_Permit_Units_2013',
 'New_Residential_2012',
 'New_Residential_Permit_Units_2012',
 'New_Residential_2011',
 'New_Residential_Permit_Units_2011',
 'Residential_Renovation_2016',
 'Residential_Renovation_Permit_Units_2016',
 'Residential_Renovation_2015',
 'Residential_Renovation_Permit_Units_2015',
 'Residential_Renovation_2014',
 'Residential_Renovation_Permit_Units_2014',
 'Residential_Renovation_2013',
 'Residential_Renovation_Permit_Units_2013',
 'Residential_Renovation_2012',
 'Residential_Renovation_Permit_Units_2012',
 'Residential_Renovation_2011',
 'Residential_Renovation_Permit_Units_2011',
 'Foreclosures_2016',
 'Foreclosed_Units_2016',
 'Foreclosures_2015',
 'Foreclosed_Units_2015',
 'Foreclosures_2013',
 'Foreclosed_Units_2013',
 'Foreclosures_2011',
 'Foreclosed_Units_2011',
 'Home_Sales_Price_2015',
 'Home_Sales_Price_2013',
 'Housing_Violations_2016',
 'Housing_Violations_Total_2016',
 'Housing_Violations_2015',
 'Housing_Violations_Total_2015',
 'Housing_Violations_2013',
 'Housing_Violations_Total_2013',
 'Housing_Violations_2011',
 'Housing_Violations_Total_2011',
 'Subsidized_Housing_2015',
 'Subsidized_Housing_Units_2015',
 'Subsidized_Housing_2013',
 'Subsidized_Housing_Units_2013',
 'Home_Ownership_2015',
 'Home_Ownership_moe_2015',
 'Residential_Occupancy_2014',
 'Residential_Occupancy_moe_2014',
 'Rental_Costs_2014',
 'Rental_Costs_moe_2014',
 'houseChange',
 'Lower_Homes_2013',
 'Lower_Homes_2015',
 'lowerHouseChange',
 'constructChange']
In [17]:
houseChange= totalSet["Home_Sales_Price_2015"] - totalSet["Home_Sales_Price_2013"]
totalSet['houseChange'] = houseChange
totalSetc = totalSet
constructChange= totalSet["Commercial_Construction_2015"] - totalSet["Commercial_Construction_2011"]
totalSetc['constructChange'] = constructChange
In [27]:
totalSetH = totalSetc[["Home_Sales_Price_2013", "Housing_Density_2013"]].copy()
totalSetH
Out[27]:
Home_Sales_Price_2013 Housing_Density_2013
0 180860.0 2.6
1 391928.0 4.1
2 845392.0 1.2
3 45200.0 2.0
4 55545.0 2.0
5 1509529.0 0.8
6 229197.0 1.7
7 245591.0 3.3
8 107523.0 2.9
9 455198.0 2.4
10 31576.0 7.9
11 215811.0 4.0
12 140580.0 3.7
13 251774.0 1.2
14 107564.0 2.9
15 81929.0 2.8
16 559354.0 2.8
17 232809.0 2.1
18 192852.0 3.6
19 115917.0 1.3
20 112591.0 1.2
21 107088.0 2.0
22 194133.0 2.6
23 278157.0 1.7
24 152858.0 0.9
25 109670.0 3.1
26 499582.0 2.8
27 210130.0 1.4
28 NaN 1.1
29 423941.0 1.8
... ... ...
432 198006.0 1.5
433 222180.0 0.7
434 173363.0 0.8
435 196016.0 1.8
436 747719.0 0.8
437 269973.0 0.2
438 364032.0 0.4
439 183500.0 0.5
440 170058.0 1.3
441 298981.0 0.2
442 247848.0 0.5
443 248441.0 2.0
444 273905.0 1.3
445 216672.0 1.4
446 169234.0 1.0
447 201116.0 1.0
448 294048.0 0.2
449 166132.0 1.9
450 257122.0 1.5
451 208789.0 1.6
452 189276.0 0.3
453 224101.0 1.4
454 NaN 0.1
455 377236.0 0.3
456 361760.0 2.1
457 169657.0 2.3
458 409188.0 1.3
459 250295.0 2.1
460 324571.0 2.3
461 389919.0 0.9

462 rows × 2 columns

In [99]:
median = totalSetc["Home_Sales_Price_2013"].median()
median2 = totalSetc["Home_Sales_Price_2015"].median()
median, median2
totalSetc["Lower_Homes_2013"]=totalSetc["Home_Sales_Price_2013"].where(totalSetc["Home_Sales_Price_2013"] < median, inplace=False)
totalSetc["Lower_Homes_2015"]=totalSetc["Home_Sales_Price_2015"].where(totalSetc["Home_Sales_Price_2015"] < median2, inplace=False)
In [242]:
#totalSetc["Lower_Homes_2015"]
lowerHouseChange = totalSetc["Lower_Homes_2015"]- totalSetc["Lower_Homes_2013"]


lowerHouseChange
totalSetc["lowerHouseChange"]= lowerHouseChange
totalSetS = totalSetc[["lowerHouseChange","Commercial_Construction_2015","Household_Income_2015" "Employment_Rate_2015","Arts_Participation_2013","Arts_Participating_Households_2013", "Neighborhood_Organizations_2014", "Neighborhood_Organizations_2015", "Board_Committee_Participation_2015", "Housing_Density_2011", "Housing_Density_2013", "Housing_Density_2015", "Single_Family_Housing_2015","Single_Family_Housing_2013", "Single_Family_Housing_2011", "Single_Family_Units_2015", "Single_Family_Units_2013", "Single_Family_Units_2011", "311_Requests_2015","311_Requests_2013","311_Calls_2015", "311_Calls_2013", "Voter_Participation_2015", "Voter_Participation_2014", "Voter_Participation_2012", "Voter_Participation_2010", "Job_Density_2013", "Job_Density_2012", "New_Residential_2015", "New_Residential_2014", "New_Residential_2013", "New_Residential_2012", "New_Residential_2011", 'Transit_Proximity_2015','Transit_Proximate_Units_2015','Transit_Proximity_2013','Transit_Proximate_Units_2013','Transit_Proximity_2011','Transit_Proximate_Units_2011', 'Bicycle_Friendliness_2015','Bicycle_Friendliness_2013','Bicycle_Friendliness_2011',
 'Street_Connectivity_2015',
 'Street_Connectivity_2013',
 'Street_Connectivity_2011',  
                      'High_School_Diploma_2015',
 'Early_Care_Proximity_2015',
 'Early_Care_Proximate_Units_2015',
 'Early_Care_Proximity_2013',
 'Early_Care_Proximate_Units_2013',
 'Early_Care_Proximity_2011',
 'Early_Care_Proximate_Units_2011',
 'SchoolAge_Proximity_2015',
 'SchoolAge_Proximate_Units_2015',
 'SchoolAge_Proximity_2013',
 'SchoolAge_Proximate_Units_2013',
 'SchoolAge_Proximity_2011',
 'SchoolAge_Proximate_Units_2011',
                     'Proficiency_Elementary_School_2013',
 'Proficiency_Middle_School_2014',
 'Proficiency_Middle_School_2013',
 'Proficiency_High_School_2014',
 'Proficiency_High_School_2013',
                      'Neighborhood_School_Attendance_2014',
 'Neighborhood_School_Attendance_2013',
 'Neighborhood_School_Attendance_2011',
'Job_Density_2014',
 'Commercial_Construction_2014',
 'Commercial_Construction_Permitted_Units_2014',
 'Commercial_Construction_2013',
 'Commercial_Construction_Permitted_Units_2013',
 'Commercial_Construction_2012',
 'Commercial_Construction_Permitted_Units_2012',
 'Commercial_Construction_2011',
 'Commercial_Construction_Permitted_Units_2011',
 'Bachelors_Degree_2015',
'Home_Sales_Price_2015',
 'Home_Sales_Price_2013',
 'Housing_Size_2015',
 'Housing_Size_2013']].copy()
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/.local/lib/python3.5/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2441             try:
-> 2442                 return self._engine.get_loc(key)
   2443             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5280)()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas/_libs/hashtable.c:20523)()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas/_libs/hashtable.c:20477)()

KeyError: 'Household_Income_2013'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-242-abe3376360c9> in <module>()
      1 #totalSetc["Lower_Homes_2015"]
      2 lowerHouseChange = totalSetc["Lower_Homes_2015"]- totalSetc["Lower_Homes_2013"]
----> 3 incomeChange = totalSetc["Household_Income_2015"]- totalSetc["Household_Income_2013"]
      4 constructChange = totalSetc["Commercial_Construction_2015"] - totalSetc["Commercial_Construction_2013"]
      5 employChange = totalSetc["Employment_Rate_2015"] - totalSetc["Exmployment_Rate_2013"]

~/.local/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key)
   1962             return self._getitem_multilevel(key)
   1963         else:
-> 1964             return self._getitem_column(key)
   1965 
   1966     def _getitem_column(self, key):

~/.local/lib/python3.5/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   1974         result = self._constructor(self._data.get(key))
   1975         if result.columns.is_unique:
-> 1976             result = result[key]
   1977 
   1978         return result

~/.local/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key)
   1962             return self._getitem_multilevel(key)
   1963         else:
-> 1964             return self._getitem_column(key)
   1965 
   1966     def _getitem_column(self, key):

~/.local/lib/python3.5/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   1969         # get column
   1970         if self.columns.is_unique:
-> 1971             return self._get_item_cache(key)
   1972 
   1973         # duplicate columns & possible reduce dimensionality

~/.local/lib/python3.5/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1643         res = cache.get(item)
   1644         if res is None:
-> 1645             values = self._data.get(item)
   1646             res = self._box_item_values(item, values)
   1647             cache[item] = res

~/.local/lib/python3.5/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   3588 
   3589             if not isnull(item):
-> 3590                 loc = self.items.get_loc(item)
   3591             else:
   3592                 indexer = np.arange(len(self.items))[isnull(self.items)]

~/.local/lib/python3.5/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2442                 return self._engine.get_loc(key)
   2443             except KeyError:
-> 2444                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2445 
   2446         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5280)()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas/_libs/hashtable.c:20523)()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas/_libs/hashtable.c:20477)()

KeyError: 'Household_Income_2013'
In [240]:
totalSetS = totalSetS[np.isfinite(totalSetS['lowerHouseChange'])]
list(totalSetS)
Out[240]:
['lowerHouseChange',
 'Commercial_Construction_2015',
 'Household_Income_2015',
 'Employment_Rate_2015',
 'Arts_Participation_2013',
 'Arts_Participating_Households_2013',
 'Neighborhood_Organizations_2014',
 'Neighborhood_Organizations_2015',
 'Board_Committee_Participation_2015',
 'Housing_Density_2011',
 'Housing_Density_2013',
 'Housing_Density_2015',
 'Single_Family_Housing_2015',
 'Single_Family_Housing_2013',
 'Single_Family_Housing_2011',
 'Single_Family_Units_2015',
 'Single_Family_Units_2013',
 'Single_Family_Units_2011',
 '311_Requests_2015',
 '311_Requests_2013',
 '311_Calls_2015',
 '311_Calls_2013',
 'Voter_Participation_2015',
 'Voter_Participation_2014',
 'Voter_Participation_2012',
 'Voter_Participation_2010',
 'Job_Density_2013',
 'Job_Density_2012',
 'New_Residential_2015',
 'New_Residential_2014',
 'New_Residential_2013',
 'New_Residential_2012',
 'New_Residential_2011',
 'Transit_Proximity_2015',
 'Transit_Proximate_Units_2015',
 'Transit_Proximity_2013',
 'Transit_Proximate_Units_2013',
 'Transit_Proximity_2011',
 'Transit_Proximate_Units_2011',
 'Bicycle_Friendliness_2015',
 'Bicycle_Friendliness_2013',
 'Bicycle_Friendliness_2011',
 'Street_Connectivity_2015',
 'Street_Connectivity_2013',
 'Street_Connectivity_2011',
 'High_School_Diploma_2015',
 'Early_Care_Proximity_2015',
 'Early_Care_Proximate_Units_2015',
 'Early_Care_Proximity_2013',
 'Early_Care_Proximate_Units_2013',
 'Early_Care_Proximity_2011',
 'Early_Care_Proximate_Units_2011',
 'SchoolAge_Proximity_2015',
 'SchoolAge_Proximate_Units_2015',
 'SchoolAge_Proximity_2013',
 'SchoolAge_Proximate_Units_2013',
 'SchoolAge_Proximity_2011',
 'SchoolAge_Proximate_Units_2011',
 'Proficiency_Elementary_School_2013',
 'Proficiency_Middle_School_2014',
 'Proficiency_Middle_School_2013',
 'Proficiency_High_School_2014',
 'Proficiency_High_School_2013',
 'Neighborhood_School_Attendance_2014',
 'Neighborhood_School_Attendance_2013',
 'Neighborhood_School_Attendance_2011',
 'Job_Density_2014',
 'Commercial_Construction_2014',
 'Commercial_Construction_Permitted_Units_2014',
 'Commercial_Construction_2013',
 'Commercial_Construction_Permitted_Units_2013',
 'Commercial_Construction_2012',
 'Commercial_Construction_Permitted_Units_2012',
 'Commercial_Construction_2011',
 'Commercial_Construction_Permitted_Units_2011',
 'Bachelors_Degree_2015',
 'Home_Sales_Price_2015',
 'Home_Sales_Price_2013',
 'Housing_Size_2015',
 'Housing_Size_2013']
In [241]:
totalSetS.to_csv("trimmed_QOL_set")
In [233]:
corr_matrix = totalSetc.corr()

corr_matrix["Household_Income_2015"].sort_values(ascending=False)
Out[233]:
Household_Income_2015                       1.000000
Arts_Participation_2013                     0.817279
Proficiency_Elementary_School_2013          0.799335
Proficiency_Elementary_School_2014          0.792465
Proficiency_Middle_School_2014              0.787226
Bachelors_Degree_2015                       0.784466
Housing_Size_2011                           0.780981
Housing_Size_2013                           0.778506
Housing_Size_2015                           0.775954
Housing_Size_2016                           0.775171
Proficiency_Middle_School_2013              0.774901
Home_Sales_Price_2013                       0.736096
Voter_Participation_2016                    0.712375
Home_Sales_Price_2015                       0.703028
Proficiency_High_School_2014                0.677863
Proficiency_High_School_2013                0.676415
Voter_Participation_2010                    0.664039
Voter_Participation_2014                    0.654535
Lower_Homes_2015                            0.646897
Lower_Homes_2013                            0.645736
Voter_Participation_2012                    0.638637
Home_Ownership_2015                         0.632181
Rental_Costs_2014                           0.630450
Household_Income_moe_2015                   0.590385
High_School_Diploma_2015                    0.589057
Highschool_Graduation_Rate_2013             0.576691
Highschool_Graduation_Rate_2014             0.572506
Voter_Participation_2015                    0.533709
Employment_Rate_2015                        0.531959
Residential_Renovation_Permit_Units_2011    0.501566
                                              ...   
Home_Ownership_moe_2015                    -0.321126
Foreclosures_2011                          -0.334360
Residential_Occupancy_moe_2014             -0.339042
Housing_Violations_2011                    -0.371715
Foreclosures_2015                          -0.377555
Transit_Proximity_2016                     -0.377964
Housing_Violations_Total_2015              -0.385252
Transit_Proximity_2015                     -0.389469
Housing_Violations_Total_2013              -0.391820
Transit_Proximity_2013                     -0.396074
Foreclosures_2013                          -0.398910
Transit_Proximity_2011                     -0.407221
Housing_Violations_Total_2016              -0.424883
Housing_Violations_2015                    -0.435578
Housing_Violations_2013                    -0.449957
SchoolAge_Proximity_2015                   -0.451245
Early_Care_Proximity_2011                  -0.459441
Housing_Violations_2016                    -0.477380
Early_Care_Proximity_2015                  -0.483102
Early_Care_Proximity_2013                  -0.492046
SchoolAge_Proximity_2011                   -0.503924
SchoolAge_Proximity_2013                   -0.511659
Rental_Houses_2013                         -0.583023
Rental_Houses_2016                         -0.608424
Rental_Houses_2015                         -0.628094
Student_Absenteeism_2013                   -0.630296
Student_Absenteeism_2014                   -0.663807
Public_Nutrition_Assistance_2015           -0.698487
Public_Nutrition_Assistance_2013           -0.708186
Public_Nutrition_Assistance_2011           -0.713454
Name: Household_Income_2015, Length: 233, dtype: float64
In [103]:
import numpy as np

def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size= int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
In [104]:
train_setS, test_setS = split_train_test(totalSetS, 0.2)
In [105]:
train_setS.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 370 entries, 433 to 209
Data columns (total 67 columns):
lowerHouseChange                       154 non-null float64
Commercial_Construction_2015           370 non-null float64
Household_Income_2015                  361 non-null float64
Employment_Rate_2015                   368 non-null float64
Arts_Participation_2013                366 non-null float64
Arts_Participating_Households_2013     366 non-null float64
Neighborhood_Organizations_2014        326 non-null float64
Neighborhood_Organizations_2015        297 non-null float64
Board_Committee_Participation_2015     368 non-null float64
Housing_Density_2011                   370 non-null float64
Housing_Density_2013                   370 non-null float64
Housing_Density_2015                   370 non-null float64
Single_Family_Housing_2015             369 non-null float64
Single_Family_Housing_2013             368 non-null float64
Single_Family_Housing_2011             369 non-null float64
Single_Family_Units_2015               369 non-null float64
Single_Family_Units_2013               368 non-null float64
Single_Family_Units_2011               369 non-null float64
311_Requests_2015                      368 non-null float64
311_Requests_2013                      368 non-null float64
311_Calls_2015                         368 non-null float64
311_Calls_2013                         368 non-null float64
Voter_Participation_2015               368 non-null float64
Voter_Participation_2014               368 non-null float64
Voter_Participation_2012               368 non-null float64
Voter_Participation_2010               368 non-null float64
Job_Density_2013                       370 non-null float64
Job_Density_2012                       370 non-null float64
New_Residential_2015                   370 non-null float64
New_Residential_2014                   370 non-null float64
New_Residential_2013                   370 non-null float64
New_Residential_2012                   370 non-null float64
New_Residential_2011                   370 non-null float64
Transit_Proximity_2015                 369 non-null float64
Transit_Proximate_Units_2015           369 non-null float64
Transit_Proximity_2013                 369 non-null float64
Transit_Proximate_Units_2013           369 non-null float64
Transit_Proximity_2011                 369 non-null float64
Transit_Proximate_Units_2011           369 non-null float64
Bicycle_Friendliness_2015              370 non-null float64
Bicycle_Friendliness_2013              370 non-null float64
Bicycle_Friendliness_2011              306 non-null float64
Street_Connectivity_2015               370 non-null float64
Street_Connectivity_2013               370 non-null float64
Street_Connectivity_2011               308 non-null float64
High_School_Diploma_2015               368 non-null float64
Early_Care_Proximity_2015              369 non-null float64
Early_Care_Proximate_Units_2015        369 non-null float64
Early_Care_Proximity_2013              369 non-null float64
Early_Care_Proximate_Units_2013        369 non-null float64
Early_Care_Proximity_2011              369 non-null float64
Early_Care_Proximate_Units_2011        369 non-null float64
SchoolAge_Proximity_2015               369 non-null float64
SchoolAge_Proximate_Units_2015         369 non-null float64
SchoolAge_Proximity_2013               369 non-null float64
SchoolAge_Proximate_Units_2013         369 non-null float64
SchoolAge_Proximity_2011               369 non-null float64
SchoolAge_Proximate_Units_2011         369 non-null float64
Proficiency_Elementary_School_2013     362 non-null float64
Proficiency_Middle_School_2014         359 non-null float64
Proficiency_Middle_School_2013         356 non-null float64
Proficiency_High_School_2014           313 non-null float64
Proficiency_High_School_2013           329 non-null float64
Neighborhood_School_Attendance_2014    367 non-null float64
Neighborhood_School_Attendance_2013    366 non-null float64
Neighborhood_School_Attendance_2011    367 non-null float64
Job_Density_2014                       370 non-null float64
dtypes: float64(67)
memory usage: 196.6 KB
In [106]:
trainData = train_setS[np.isfinite(train_setS['lowerHouseChange'])]
trainData.head()
Out[106]:
lowerHouseChange Commercial_Construction_2015 Household_Income_2015 Employment_Rate_2015 Arts_Participation_2013 Arts_Participating_Households_2013 Neighborhood_Organizations_2014 Neighborhood_Organizations_2015 Board_Committee_Participation_2015 Housing_Density_2011 ... SchoolAge_Proximate_Units_2011 Proficiency_Elementary_School_2013 Proficiency_Middle_School_2014 Proficiency_Middle_School_2013 Proficiency_High_School_2014 Proficiency_High_School_2013 Neighborhood_School_Attendance_2014 Neighborhood_School_Attendance_2013 Neighborhood_School_Attendance_2011 Job_Density_2014
8 34243.0 1.09 25227.0 88.0 7.0 105.0 2.0 2.0 1.07 2.9 ... 1599.0 13.3 29.8 15.5 6.7 23.1 71.0 77.0 53.0 1.1
379 -10371.0 3.79 27958.0 80.0 2.0 29.0 7.0 7.0 0.21 2.5 ... 2015.0 12.0 15.8 13.0 20.4 12.3 71.0 75.0 67.0 0.6
325 83.0 2.69 41250.0 98.0 5.0 32.0 2.0 2.0 0.62 0.5 ... 643.0 45.8 41.9 18.8 33.3 NaN 59.0 63.0 73.0 4.9
365 2153.0 1.15 27973.0 84.0 3.0 61.0 5.0 5.0 0.00 1.6 ... 2258.0 14.5 21.7 13.3 32.4 26.1 72.0 74.0 68.0 2.0
198 5236.0 0.00 32371.0 76.0 4.0 28.0 2.0 2.0 0.00 2.8 ... 713.0 19.2 26.1 18.8 17.9 12.1 81.0 83.0 83.0 1.0

5 rows × 67 columns

In [223]:
totalform = totalSetS[np.isfinite(totalSetS['lowerHouseChange'])]
totalform.to_csv("trimmed_QOL_dataset")
In [129]:
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='median')
#imputer.fit(trainData)
#trainDataPrepared = imputer.transform(trainData)
In [130]:
imputer.fit(trainData)
Out[130]:
Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)
In [131]:
trainData.isnull()
Out[131]:
lowerHouseChange Commercial_Construction_2015 Household_Income_2015 Employment_Rate_2015 Arts_Participation_2013 Arts_Participating_Households_2013 Neighborhood_Organizations_2014 Neighborhood_Organizations_2015 Board_Committee_Participation_2015 Housing_Density_2011 ... SchoolAge_Proximate_Units_2011 Proficiency_Elementary_School_2013 Proficiency_Middle_School_2014 Proficiency_Middle_School_2013 Proficiency_High_School_2014 Proficiency_High_School_2013 Neighborhood_School_Attendance_2014 Neighborhood_School_Attendance_2013 Neighborhood_School_Attendance_2011 Job_Density_2014
8 False False False False False False False False False False ... False False False False False False False False False False
379 False False False False False False False False False False ... False False False False False False False False False False
325 False False False False False False False False False False ... False False False False False True False False False False
365 False False False False False False False False False False ... False False False False False False False False False False
198 False False False False False False False False False False ... False False False False False False False False False False
135 False False False False False False False False False False ... False False False False False False False False False False
262 False False False False False False False False False False ... False False False False False False False False False False
182 False False False False False False False False False False ... False False False False False False False False False False
254 False False False False False False False False False False ... False False False False False False False False False False
380 False False False False False False False False False False ... False False False False True False False False False False
148 False False False False False False False False False False ... False False False False False False False False False False
86 False False False False False False False False False False ... False False False False False False False False False False
399 False False False False False False True True False False ... False False False False False False False False False False
132 False False False False False False False False False False ... False False False False False False False False False False
255 False False False False False False False False False False ... False False False False False False False False False False
21 False False False False False False False False False False ... False False False False False False False False False False
156 False False False False False False False False False False ... False False False False False False False False False False
50 False False False False False False False False False False ... False False False False False False False False False False
371 False False False False False False False False False False ... False False False False False False False False False False
316 False False False False False False False False False False ... False False False False True False False False False False
233 False False False False False False False False False False ... False False False False False False False False False False
232 False False False False False False False False False False ... False False False False False False False False False False
99 False False False False False False False False False False ... False False False False False False False False False False
35 False False False False False False False False False False ... False False False False False False False False False False
137 False False False False False False False False False False ... False False False False False False False False False False
176 False False False False False False False False False False ... False False False False False False False False False False
103 False False False False False False False False False False ... False False False False False False False False False False
85 False False False False False False False False False False ... False False False False False False False False False False
48 False False False False False False False False False False ... False False False False False False False False False False
186 False False True False False False False False False False ... False False False False False True False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
160 False False False False False False False False False False ... False False False False False False False False False False
44 False False False False False False False False False False ... False False False False False False False False False False
398 False False False False False False True True False False ... False False False False False False False False False False
151 False False False False False False False False False False ... False False False False False False False False False False
377 False False False False False False False False False False ... False False False False False False False False False False
323 False False False False False False False False False False ... False False False False False False False False False False
59 False False False False False False False False False False ... False False False False False True False False False False
326 False False False False False False False False False False ... False False False False False False False False False False
239 False False False False False False False False False False ... False False False False False False False False False False
104 False False False False False False False True False False ... False False False False False False False False False False
192 False False False False False False False False False False ... False False False False False False False False False False
308 False False False False False False False False False False ... False False False False False False False False False False
376 False False False False False False False False False False ... False False False False False False False False False False
348 False False False False False False False False False False ... False False False False False True False False False False
55 False False False False False False False False False False ... False False False False False False False False False False
51 False False False False False False False False False False ... False False False False False False False False False False
426 False False False False False False True True False False ... False False False False False False False False False False
383 False False False False False False False False False False ... False False False False False False False False False False
319 False False False False False False False False False False ... False False False False False False False False False False
356 False False False False False False False False False False ... False False False False False False False False False False
67 False False False False False False False False False False ... False False False False False False False False False False
216 False False False False False False False False False False ... False False False False False False False False False False
70 False False False False False False False False False False ... False False False False False False False False False False
265 False False False False False False False False False False ... False False False False False False False False False False
123 False False False False False False False False False False ... False False False False False False False False False False
283 False False False False False False False False False False ... False False False False False False False False False False
25 False False False False False False False False False False ... False False False False False False False False False False
259 False False False False False False False False False False ... False False False False False False False False False False
274 False False False False False False False False False False ... False False False False False False False False False False
170 False False False False False False False False False False ... False False False False False False False False False False

154 rows × 67 columns

In [132]:
trainDataPrepared = imputer.transform(trainData)
In [136]:
trainDataPrepared
Out[136]:
array([[  3.42430000e+04,   1.09000000e+00,   2.52270000e+04, ...,
          7.70000000e+01,   5.30000000e+01,   1.10000000e+00],
       [ -1.03710000e+04,   3.79000000e+00,   2.79580000e+04, ...,
          7.50000000e+01,   6.70000000e+01,   6.00000000e-01],
       [  8.30000000e+01,   2.69000000e+00,   4.12500000e+04, ...,
          6.30000000e+01,   7.30000000e+01,   4.90000000e+00],
       ..., 
       [  1.21560000e+04,   3.60000000e-01,   7.63710000e+04, ...,
          8.20000000e+01,   8.40000000e+01,   5.00000000e-01],
       [  1.34670000e+04,   1.42000000e+00,   3.17030000e+04, ...,
          8.70000000e+01,   7.30000000e+01,   3.10000000e+00],
       [  3.35200000e+03,   1.20000000e+00,   5.75960000e+04, ...,
          7.40000000e+01,   4.70000000e+01,   8.00000000e-01]])
In [144]:
import numpy
numpy.isnan(trainDataPrepared).any()
Out[144]:
False
In [149]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
In [150]:
scaler.fit(trainDataPrepared)
Out[150]:
StandardScaler(copy=True, with_mean=True, with_std=True)
In [151]:
trainPrep= scaler.transform(trainDataPrepared)
In [153]:
numpy.isnan(trainPrep).any()
Out[153]:
False
In [192]:
testData = test_setS[np.isfinite(test_setS['lowerHouseChange'])]
imputer4 = Imputer(missing_values='NaN', strategy='median')
imputer4.fit(testData)
Out[192]:
Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)
In [193]:
testDataPrepared = imputer.transform(testData)
In [194]:
scaler2 =StandardScaler()
scaler.fit(testDataPrepared)
Out[194]:
StandardScaler(copy=True, with_mean=True, with_std=True)
In [154]:
trainPrep.shape
Out[154]:
(154, 67)
In [195]:
testPrep = scaler.transform(testDataPrepared)
In [156]:
imputer2 = Imputer()
imputer2.fit(trainPrep)
trainImp= imputer.transform(trainPrep)
In [176]:
#find_labels = trainData[["lowerHouseChange", "Commercial_Construction_2015","Household_Income_2015","Employment_Rate_2015"  ]].copy()
find_labels = trainData[["lowerHouseChange"]].copy()
In [177]:
imputer3 = Imputer()
In [178]:
imputer3.fit(find_labels)
Out[178]:
Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
In [179]:
labelImp=imputer3.transform(find_labels)
In [231]:
trainImp
Out[231]:
array([[ 1.95490933, -0.08916177, -1.3079209 , ..., -0.09589613,
        -1.31985547, -0.21945196],
       [-1.20417041,  1.32964669, -1.14275497, ..., -0.31148743,
        -0.22158901, -0.44718512],
       [-0.46393148,  0.75161361, -0.33887892, ..., -1.60503525,
         0.24909661,  1.51132007],
       ..., 
       [ 0.39094747, -0.47276554,  1.78517548, ...,  0.44308212,
         1.11202026, -0.49273175],
       [ 0.48377828,  0.08424815, -0.91626416, ...,  0.98206038,
         0.24909661,  0.69148069],
       [-0.23245635, -0.03135847,  0.64969752, ..., -0.41928308,
        -1.79054109, -0.35609185]])
In [180]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(trainImp, labelImp)
/home/alex/.local/lib/python3.5/site-packages/ipykernel_launcher.py:3: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  This is separate from the ipykernel package so we can avoid doing imports until
Out[180]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
In [186]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation:", scores.std())
In [187]:
display_scores(forest_rmse_scores)
Scores: [ 3471.76440661   285.79436226  2334.74771844  1963.64404297  8340.34662859
   315.14488335   281.81858526   451.8711791    661.73458048  5540.20928113
  1595.0687975    567.74354466  1054.45487279  1550.72395996  2064.93116489
   245.36361938   543.92050233  9956.67726604  1097.87064356   640.83877179]
Mean: 2148.23344055
Standard Deviation: 2664.34193673
In [184]:
trainData["lowerHouseChange"]
Out[184]:
8      34243.0
379   -10371.0
325       83.0
365     2153.0
198     5236.0
135   -21333.0
262     3625.0
182    13115.0
254     1767.0
380    10269.0
148     5673.0
86      2787.0
399    13673.0
132    15561.0
255    16379.0
21      9957.0
156     8786.0
50     10889.0
371     4840.0
316    -3657.0
233     8258.0
232    -9427.0
99    -12055.0
35      1091.0
137    -5083.0
176    -6016.0
103    -2075.0
85    -15292.0
48     -3866.0
186    11426.0
        ...   
160    46753.0
44      9438.0
398    -5695.0
151     4798.0
377     8954.0
323     4983.0
59      4022.0
326      -67.0
239    18211.0
104    -9077.0
192    27449.0
308    15654.0
376    22947.0
348    28545.0
55     72625.0
51     21476.0
426    11377.0
383     1429.0
319     5180.0
356   -16152.0
67     -3381.0
216    22365.0
70    -15667.0
265     9848.0
123    16200.0
283     8557.0
25       885.0
259    12156.0
274    13467.0
170     3352.0
Name: lowerHouseChange, Length: 154, dtype: float64
In [226]:
test = testPrep[22]
trainPredict = forest_reg.predict(trainImp)
In [251]:
from sklearn import cross_validation
model = RandomForestRegressor(n_estimators=100)

#Simple K-Fold cross validation. 10 folds.
cv = cross_validation.KFold(len(trainImp), n_folds=10)

results = []
# "Error_function" can be replaced by the error function of your analysis
for traincv, testcv in cv:
    model.fit(trainImp, labelImp)
/home/alex/.local/lib/python3.5/site-packages/ipykernel_launcher.py:10: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  # Remove the CWD from sys.path while we load stuff.
/home/alex/.local/lib/python3.5/site-packages/ipykernel_launcher.py:10: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  # Remove the CWD from sys.path while we load stuff.
/home/alex/.local/lib/python3.5/site-packages/ipykernel_launcher.py:10: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  # Remove the CWD from sys.path while we load stuff.
/home/alex/.local/lib/python3.5/site-packages/ipykernel_launcher.py:10: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  # Remove the CWD from sys.path while we load stuff.
/home/alex/.local/lib/python3.5/site-packages/ipykernel_launcher.py:10: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  # Remove the CWD from sys.path while we load stuff.
/home/alex/.local/lib/python3.5/site-packages/ipykernel_launcher.py:10: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  # Remove the CWD from sys.path while we load stuff.
/home/alex/.local/lib/python3.5/site-packages/ipykernel_launcher.py:10: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  # Remove the CWD from sys.path while we load stuff.
/home/alex/.local/lib/python3.5/site-packages/ipykernel_launcher.py:10: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  # Remove the CWD from sys.path while we load stuff.
/home/alex/.local/lib/python3.5/site-packages/ipykernel_launcher.py:10: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  # Remove the CWD from sys.path while we load stuff.
/home/alex/.local/lib/python3.5/site-packages/ipykernel_launcher.py:10: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  # Remove the CWD from sys.path while we load stuff.
In [257]:
from sklearn.svm import LinearSVR

svm_reg = LinearSVR(epsilon=1.5, random_state=42)
svm_reg.fit(trainImp, labelImp)
/home/alex/.local/lib/python3.5/site-packages/sklearn/utils/validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Out[257]:
LinearSVR(C=1.0, dual=True, epsilon=1.5, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=42, tol=0.0001, verbose=0)
In [260]:
svm_reg1 = LinearSVR(epsilon=1.5, random_state=42)
svm_reg2 = LinearSVR(epsilon=0.5, random_state=42)
svm_reg1.fit(trainImp, labelImp)
svm_reg2.fit(trainImp, labelImp)

def find_support_vectors(svm_reg, trainImp, labelImp):
    y_pred = svm_reg.predict(testPrep)
    off_margin = (np.abs(labelImp - y_pred) >= svm_reg.epsilon)
    return np.argwhere(off_margin)

svm_reg1.support_ = find_support_vectors(svm_reg1, trainImp, labelImp)
svm_reg2.support_ = find_support_vectors(svm_reg2, trainImp, labelImp)

eps_x1 = 1
eps_y_pred = svm_reg1.predict(testPrep)
/home/alex/.local/lib/python3.5/site-packages/sklearn/utils/validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
In [261]:
eps_y_pred
Out[261]:
array([  -7.19667455, -484.46545415,  -85.54076415,   92.99986668,
          9.62130403,  311.86835696,  590.1876458 ,  607.59604855,
        111.22439289,  -28.71705254, -201.49094587, -896.85040837,
        -72.33448147, -154.37616691,  505.11950041,  504.65265345,
       -254.5124657 , -768.16197566,   51.93709788,  987.79722529,
        213.18255611, -322.26033251,  225.8515049 ,  -38.29544506,
        465.2789691 ,  253.80424175,  206.70054122, -724.08580994,
        538.49344628, -651.05173239,  715.3853906 ,  512.82081018,
        160.81815721])
In [202]:
test_labels = testData[["lowerHouseChange"]].copy()
In [203]:
imputer4 = Imputer()
imputer4.fit(test_labels)
Out[203]:
Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
In [204]:
imputer4.transform(test_labels)
Out[204]:
array([[ 14139.],
       [  5583.],
       [ 46115.],
       [  6521.],
       [ 11116.],
       [  6735.],
       [  5145.],
       [  7141.],
       [  1853.],
       [ 11234.],
       [ 16135.],
       [-54821.],
       [ -2001.],
       [ 14096.],
       [  3577.],
       [ 14866.],
       [  5344.],
       [-18868.],
       [ 32600.],
       [   787.],
       [ 11732.],
       [  1518.],
       [ 67888.],
       [ 41799.],
       [  1979.],
       [  6227.],
       [  4810.],
       [ -8000.],
       [ 10839.],
       [  9150.],
       [  -996.],
       [  9374.],
       [ 10315.]])
In [1]:
#from sklearn.model_selection import GridSearchCV

#param_grid = [
   # {'n_estimators': [3,10,30, 40, 50, 60 , 70, 80 , 90, 100], 'max_features':[2,4,6,8, 10, 12, 14, 16, 18, 20]},
    #{'bootstrap': [False], 'n_estimators': [3,10], 'max_features':[2,3,4]},    
#]
#forest_reg2 = RandomForestRegressor()

#grid_search = GridSearchCV(forest_reg2, param_grid, cv=5, scoring='neg_mean_squared_error')

#grid_search.fit(trainImp, labelImp)
In [211]:
model = grid_search.best_estimator_
In [218]:
testPred = model.predict(testPrep)
In [219]:
testPred
Out[219]:
array([  9283.18,   5411.28,  15775.02,   3885.46,  11029.08,   4375.72,
         5459.56,   3508.52,   2131.14,  10319.18,  10169.42,  -9547.46,
         1522.5 ,  15510.02,   3431.58,  14953.38,   5437.98, -11653.08,
        15837.78,   3736.5 ,   9528.86,   1018.92,  29927.04,  14801.66,
         4196.46,   4682.4 ,   4087.86,  -4225.9 ,  12684.7 ,   3621.74,
         3206.12,   8211.12,   7832.62])
In [263]:
test_set.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-263-548acbc53724> in <module>()
----> 1 test_set.head()

NameError: name 'test_set' is not defined