from IPython.display import Image
# Build an IPython Image object for the local file "seattle-travel.jpg";
# a notebook renders it when this expression is the last one in a cell.
Image("seattle-travel.jpg")
To get a better understanding of the visualization tool folium, this document provides in-depth explanations of its basic functions:
To get a better understanding of the pandas library, the guide listed below is a quick and efficient cheat sheet:
The CSV file containing all the data used in this project comes from Seattle's official city government site. The site contains citizen, business, and visitor information sections, plus city government information. Specifically, the section used is its Public Data Sets. That section contains high-value, machine-readable datasets created by the City of Seattle and available for public use. The file we are using is its "Crime data set from 2008 to Present." The following process was used to create tidy data. (In tidy data, each variable forms a column and each observation forms a row.)
!pip install folium
import folium
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from datetime import datetime
import requests #get request
import pandas as pd #pandas
import numpy as np #
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
Retrieving the 2008-Present Seattle Crime Data Set:
# Address of the CSV export of the crime data set on data.seattle.gov.
url = "https://data.seattle.gov/api/views/tazs-3rd5/rows.csv?accessType=DOWNLOAD"
# Fetch the CSV and parse it into the raw, unmodified DataFrame.
seattleCrime = pd.read_csv(url)
seattleCrime
# List the distinct values of the two categorical columns examined below.
seattleCrime['Crime Against Category'].unique()
seattleCrime['Offense Parent Group'].unique()
# Scatter "Offense Parent Group" against "Crime Against Category" for the
# first 3000 rows of the raw table.
# head() selects the same leading rows as the original per-row .loc[x] loop
# in a single step, and it does not raise when fewer than 3000 rows exist.
sample_rows = seattleCrime.head(3000)
array_1 = sample_rows['Offense Parent Group'].tolist()
array_2 = sample_rows['Crime Against Category'].tolist()

plt.figure(figsize=(25, 10))
plt.ylabel("Offense Parent Group")
plt.xlabel("Crime Against Category")
plt.title("Offense Parent Group vs Crime Against Category")
plt.scatter(array_2, array_1)
# Build the tidy working table from the raw crime data.
# Start from the frame that was already downloaded instead of fetching the
# CSV a second time, so both frames come from the same snapshot.
seattleTidy = seattleCrime.copy()
# Drop the offense end time (many are unknown) and the offense code (not
# analyzed) BEFORE discarding incomplete rows. Calling dropna() first, as the
# original did, removed every report whose end time was missing -- the very
# rows that dropping the column was meant to keep.
seattleTidy = seattleTidy.drop(columns=["Offense End DateTime", "Offense Code"])
# Drop all the rows that still contain a NaN in any remaining column.
seattleTidy.dropna(inplace=True)
# Rank the Offense Parent Groups by how many reports fall under each one.
# A single groupby yields every count at once; the original re-ran the same
# groupby over a full copy of the table once per group.
parent_group_counts = seattleTidy.groupby("Offense Parent Group")["Offense"].count()
# All unique Offense Parent Groups, in alphabetical order.
crimeCounts = sorted(parent_group_counts.index)
# Pair each group with its count and sort by count, largest first.
# Counts are taken from .values so they stay NumPy integers, as before
# (later cells call .item() on them); ties keep their alphabetical order.
crimeCounts2 = sorted(
    zip(parent_group_counts.index, parent_group_counts.values),
    key=lambda pair: pair[1],
    reverse=True,
)
# Show the ten most frequent Offense Parent Groups.
crimeCounts2[0:10]
# Save the five most frequent Offense Parent Groups for the plots below.
SeattleTop_5_crimes = crimeCounts2[0:5]
SeattleTop_5_crimes
# Convert the two timestamp columns from text into datetime values so that
# the hour, month and year of each offense can be read directly later on.
# The format is year-month-day followed by a 24-hour time.
time_format = "%Y-%m-%d %H:%M:%S"
# Parse each column in one vectorized call instead of a row-by-row
# datetime.strptime loop. A value that does not match time_format still
# raises ValueError, as it did before.
for column in ("Offense Start DateTime", "Report DateTime"):
    seattleTidy[column] = pd.to_datetime(
        seattleTidy[column].astype(str), format=time_format
    )
seattleTidy.head()
# Plot, for each of Seattle's top 5 Offense Parent Groups, how many offenses
# started in each hour of the day. This shows whether most crimes happen
# during the day, the afternoon or the night.
top_groups = [group for group, _ in SeattleTop_5_crimes]
# pd.to_datetime accepts the column whether it holds datetime objects or is
# already a datetime64 column.
start_hours = pd.to_datetime(seattleTidy["Offense Start DateTime"]).dt.hour
parent_groups = seattleTidy["Offense Parent Group"]
# Map each top group to the list of start hours of its offenses, in row order.
Hourlycrimes = {
    group: start_hours[parent_groups == group].tolist()
    for group in top_groups
}
for i in Hourlycrimes:
    # Edges 0..24 give exactly one bin per hour even when some hours are
    # absent from the data; a plain bins=24 would stretch the bins over
    # whatever range the data happens to span. The unused np.histogram call
    # from the original was dropped.
    plt.hist(Hourlycrimes[i], bins=range(25), color="red", edgecolor='black', linewidth=1.2)
    plt.ylabel("Number of " + i)
    plt.xlabel("Hour of Day")
    plt.title(i + " Crimes per Hour")
    plt.show()
# Label every offense with the season in which it started.
# Months 12, 1 and 2 are Winter; 3-5 Spring; 6-8 Summer; 9-11 Fall.
season_by_month = {
    12: "Winter", 1: "Winter", 2: "Winter",
    3: "Spring", 4: "Spring", 5: "Spring",
    6: "Summer", 7: "Summer", 8: "Summer",
    9: "Fall", 10: "Fall", 11: "Fall",
}
# Read the month of every start time at once and translate it through the
# table above, instead of walking the table row by row.
start_months = pd.to_datetime(seattleTidy["Offense Start DateTime"]).dt.month
seattleTidy['SEASON OF OCCURENCE'] = start_months.map(season_by_month)
seattleTidy.head()
# Record the calendar year in which each offense started.
# Computed for the whole column at once; the resulting column has an integer
# dtype rather than the object dtype produced by the row-by-row version.
seattleTidy['Year Start'] = pd.to_datetime(seattleTidy["Offense Start DateTime"]).dt.year
seattleTidy.head()
# Plot, for each of the top 5 Offense Parent Groups, how many offenses
# started in each calendar year.
top_groups = [group for group, _ in SeattleTop_5_crimes]
start_years = pd.to_datetime(seattleTidy["Offense Start DateTime"]).dt.year
parent_groups = seattleTidy["Offense Parent Group"]
# Map each top group to the list of start years of its offenses, in row order.
Yearlycrimes2 = {
    group: start_years[parent_groups == group].tolist()
    for group in top_groups
}
for i in Yearlycrimes2:
    years = Yearlycrimes2[i]
    # One bin per calendar year. The original reused bins=24 from the hourly
    # plot, which splits the year range into 24 equal slices that do not line
    # up with whole years. Fall back to 24 only if a group has no rows.
    year_bins = range(min(years), max(years) + 2) if years else 24
    plt.hist(years, bins=year_bins, color="green", edgecolor='black', linewidth=1.2)
    plt.ylabel("Number of " + i)
    plt.xlabel("Year")
    plt.title(i + " Crimes per Year")
    plt.show()
seattleTidy
# Count how many offenses started in each of the four seasons.
# One groupby gives all counts; the original recomputed it once per season.
season_counts = seattleTidy.groupby("SEASON OF OCCURENCE")["Offense"].count()
# Seasons present in the data, in alphabetical order.
crimeSeason = sorted(season_counts.index)
# List the (season, count) pairs in calendar order. The original obtained
# this order by swapping the first and last entries of the alphabetical list,
# which is only correct when all four seasons are present.
season_order = ["Winter", "Spring", "Summer", "Fall"]
crimeSeason2 = [
    (season, season_counts[season])
    for season in season_order
    if season in season_counts.index
]
crimeSeason2
# lis refers to the same list object as crimeSeason2, as it did before.
lis = crimeSeason2
lis
# Prepare the (season, number of crimes) pairs for visualization: every pair
# becomes a (str, int) tuple.
crimeSeason2Copy = list(lis)
# int() performs the conversion inside the new list. The original called
# .item() on the loop variable only, so the converted value was thrown away
# and the list kept its NumPy integers.
graphOfSeasons = [(season, int(total)) for season, total in crimeSeason2Copy]
for season, total in graphOfSeasons:
    print(type(season), type(total))
# Draw a bar graph and a line graph of seasons vs. crimes committed.
# The bar heights are read from crimeSeason2Copy instead of being typed in
# by hand, so the chart always matches the data that was actually loaded.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
season_names = [season for season, _ in crimeSeason2Copy]
season_totals = [int(total) for _, total in crimeSeason2Copy]
ax.bar(season_names, season_totals)
plt.title("Crimes Committed Per Season From 2008 to 2020")
plt.xlabel("Season")
plt.ylabel("Number of Crimes Committed")
plt.show()
# The list of (season, crimes) pairs is unzipped into x and y sequences for
# the line graph.
plt.figure(figsize=(20, 10))
plt.plot(*zip(*crimeSeason2Copy))
plt.title("Crimes Committed Per Season From 2008 to 2020")
plt.xlabel("Season")
plt.ylabel("Number of Crimes Committed")
plt.show()
# Bar graph and line graph of the five most frequent Offense Parent Groups.
crime_copy = list(SeattleTop_5_crimes)
# Convert each count to a plain int inside the new list; the original called
# .item() on the loop variable only, which left the list unchanged.
testList2 = [(group, int(total)) for group, total in crime_copy]
for group, total in testList2:
    print(type(group), type(total))
fig2 = plt.figure(figsize=(20, 10))
# adjust image size
ax2 = fig2.add_axes([0, 0, 1, 1])
# Labels and heights come from the ranking computed earlier rather than from
# hand-copied literals, so they cannot drift out of sync with the data.
TypeParentOffenseGroup = [group for group, _ in testList2]
numC = [total for _, total in testList2]
ax2.bar(TypeParentOffenseGroup, numC)
plt.title("Crimes Committed for Top 5 Offense Parent Groups")
plt.xlabel("Offense Parent Group")
plt.ylabel("Number of Crimes Committed")
plt.show()
# Same data as a line graph.
plt.figure(figsize=(20, 10))
plt.plot(*zip(*testList2))
plt.title("Crimes Committed in Each Parent Offense Group")
plt.xlabel("Offense Parent Group")
plt.ylabel("Number of Crimes Committed")
plt.show()
# Rank the individual offenses by how often they occur and plot the top five.
a = seattleTidy['Offense']
a
# Labels and counts now come from the same table. The original took the
# labels from seattleTidy but the counts from seattleCrime and paired them by
# position, which mislabels counts whenever the two tables do not contain
# exactly the same set of offenses.
offense_totals = seattleTidy.groupby("Offense")["Offense Parent Group"].count()
crimewCounts = sorted(offense_totals.index)
# Sort by count, largest first; ties keep their alphabetical order.
crimewCounts2 = sorted(
    zip(offense_totals.index, offense_totals.values),
    key=lambda pair: pair[1],
    reverse=True,
)
crimewCounts2[0:10]
top5Offenses = crimewCounts2[0:5]
top5Offenses
crimes5 = top5Offenses
plt.figure(figsize=(20, 10))
plt.plot(*zip(*crimes5))
plt.title("Most Common Offenses")
plt.xlabel("Offense")
plt.ylabel("Number Committed")
plt.show()
# Create a copy of the data set to use for the folium maps.
# .copy() makes this a real copy; plain assignment only created a second name
# for seattleTidy, so the in-place dropna below also acted on seattleTidy.
heatMapCSV = seattleTidy.copy()
heatMapCSV.dropna(inplace=True)
heatMapCSV
# Only the crime committed and its latitude/longitude matter for the maps, so
# much of the other crime information is dropped to make iterating easier.
unused_columns = [
    "Group A B",
    "Precinct",
    "Sector",
    "Beat",
    "MCPP",
    "Report DateTime",
    "Offense ID",
    "100 Block Address",
    "SEASON OF OCCURENCE",
    "Report Number",
]
heatMapCSV = heatMapCSV.drop(columns=unused_columns)
# reset_index returns a new frame, so its result must be assigned; the
# original discarded it. drop=True renumbers the rows 0..n-1 without adding
# the old labels as a column, so index labels match row positions again.
heatMapCSV = heatMapCSV.reset_index(drop=True)
heatMapCSV
# Add the English name of the month in which each offense started
# ("January" ... "December").
# month_name() replaces the twelve-branch row-by-row chain with a single
# call over the whole column.
heatMapCSV['Month of OCCURENCE'] = pd.to_datetime(
    heatMapCSV["Offense Start DateTime"]
).dt.month_name()
heatMapCSV.head()
heatPlot = heatMapCSV
heatPlot


def _new_seattle_map():
    """Return a fresh folium map centred on [47.6062, -122.331] at zoom 11."""
    return folium.Map(location=[47.6062, -122.331], zoom_start=11)


# Initial display of Seattle
map_model = _new_seattle_map()
map_model
# Map to display Offense Parent Group crime locations
map_parentGroup = _new_seattle_map()
map_parentGroup
# Map to display the seasons in which crimes occurred
map_month = _new_seattle_map()
map_month
# Map to display crime-against-category locations
map_crimeAgainst = _new_seattle_map()
map_crimeAgainst
# To plot the crime locations, the Latitude and Longitude columns are pulled
# out together ...
locationsCrime = heatMapCSV[['Latitude', 'Longitude']]
locationsCrime
# ... and turned into a list of [latitude, longitude] pairs, one per row.
locationsCrimeList = locationsCrime.values.tolist()
locationsCrimeList
# Distinct values of the columns used to colour the maps.
heatMapCSV['Crime Against Category'].unique()
heatMapCSV['Month of OCCURENCE'].unique()
heatMapCSV['Offense'].unique()
def isPopularOffense(offense):
    """Return the marker colour for one of the five highlighted parent groups.

    Args:
        offense: An "Offense Parent Group" value.

    Returns:
        The colour name assigned to that group, or None when the value is not
        one of the five groups listed below.
    """
    # A lookup table replaces the if/elif chain; .get() keeps the original
    # behaviour of returning None for any other value.
    color_by_group = {
        'LARCENY-THEFT': 'white',
        'BURGLARY/BREAKING & ENTERING': 'red',
        'MOTOR VEHICLE THEFT': 'blue',
        'DESTRUCTION/DAMAGE/VANDALISM OF PROPERTY': 'yellow',
        'FRAUD OFFENSES': 'green',
    }
    return color_by_group.get(offense)
# Take the first 3000 rows of the map table for plotting.
# NOTE(review): the slice is positional and is not filtered by year, so these
# rows are only "2020 crimes" if the file happens to start with them -- confirm.
heatPlotIT = heatMapCSV[0:3000]
heatPlotIT
# Count how many of the sampled rows fall under each of the five highlighted
# Offense Parent Groups. The burglary key now includes the spaces around "&"
# that the data uses; the original compared against
# 'BURGLARY/BREAKING&ENTERING', which never matched, so that count stayed 0.
parent_group_totals = heatPlotIT['Offense Parent Group'].value_counts()
count_LARCENY_THEFT = int(parent_group_totals.get('LARCENY-THEFT', 0))
count_BURGLARY_BREAKING_and_ENTERING = int(parent_group_totals.get('BURGLARY/BREAKING & ENTERING', 0))
count_MOTOR_VEHICLE_THEFT = int(parent_group_totals.get('MOTOR VEHICLE THEFT', 0))
count_DESTRUCTION_DAMAGE_VANDALISM_OF_PROPERTY = int(parent_group_totals.get('DESTRUCTION/DAMAGE/VANDALISM OF PROPERTY', 0))
count_FRAUD_OFFENSES = int(parent_group_totals.get('FRAUD OFFENSES', 0))
# Add one circle marker per sampled row, coloured by its parent group.
# Coordinates are read from the same row as the group. The original indexed a
# positional list with the row's index label, which points at a different
# row (or past the end of the list) whenever the index is not 0..n-1.
for latitude, longitude, parent_group in zip(
    heatPlotIT['Latitude'],
    heatPlotIT['Longitude'],
    heatPlotIT['Offense Parent Group'],
):
    map_parentGroup.add_child(folium.CircleMarker(location=[latitude, longitude], fill='true', radius=6, popup='Hi', fill_color=isPopularOffense(parent_group), color='clear', fill_opacity=1))
map_parentGroup
def isCrimeAgainst(group):
    """Return the marker colour for a "Crime Against Category" value.

    Args:
        group: A "Crime Against Category" value.

    Returns:
        The colour name assigned to that category, or None when the value is
        not one of the four categories listed below.
    """
    # A lookup table replaces the if/elif chain; .get() keeps the original
    # behaviour of returning None for any other value.
    color_by_category = {
        'PROPERTY': 'red',
        'SOCIETY': 'green',
        'PERSON': 'blue',
        'NOT_A_CRIME': 'white',
    }
    return color_by_category.get(group)
# Count the sampled rows in each "Crime Against Category" and add one marker
# per row to map_crimeAgainst, coloured by that category.
category_totals = heatPlotIT['Crime Against Category'].value_counts()
count_PROPERTY = int(category_totals.get('PROPERTY', 0))
count_SOCIETY = int(category_totals.get('SOCIETY', 0))
count_PERSON = int(category_totals.get('PERSON', 0))
count_NotACrime = int(category_totals.get('NOT_A_CRIME', 0))
# Coordinates are read from the same row as the category. The original
# indexed a positional list with the row's index label, which points at a
# different row (or past the end of the list) whenever the index is not 0..n-1.
for latitude, longitude, category in zip(
    heatPlotIT['Latitude'],
    heatPlotIT['Longitude'],
    heatPlotIT['Crime Against Category'],
):
    map_crimeAgainst.add_child(folium.CircleMarker(location=[latitude, longitude], fill='true', radius=6, popup='Hi', fill_color=isCrimeAgainst(category), color='clear', fill_opacity=1))
map_crimeAgainst
def whatMonth(the_month):
    """Return the marker colour for the season that a month name belongs to.

    December-February map to 'green', March-May to 'yellow', June-August to
    'white' and September-November to 'blue'.

    Args:
        the_month: An English month name such as 'January'.

    Returns:
        The colour name for that month, or None when the value is not one of
        the twelve month names.
    """
    # A lookup table replaces the twelve-branch chain; .get() keeps the
    # original behaviour of returning None for any other value.
    color_by_month = {
        'December': 'green', 'January': 'green', 'February': 'green',
        'March': 'yellow', 'April': 'yellow', 'May': 'yellow',
        'June': 'white', 'July': 'white', 'August': 'white',
        'September': 'blue', 'October': 'blue', 'November': 'blue',
    }
    return color_by_month.get(the_month)
# Count the sampled rows per season and add one marker per row to map_month,
# coloured by the season of the month in which the offense started.
season_by_month_name = {
    'December': 'Winter', 'January': 'Winter', 'February': 'Winter',
    'March': 'Spring', 'April': 'Spring', 'May': 'Spring',
    'June': 'Summer', 'July': 'Summer', 'August': 'Summer',
    'September': 'Fall', 'October': 'Fall', 'November': 'Fall',
}
# Translate each month name to its season, then tally the seasons in one pass.
sampled_season_totals = heatPlotIT['Month of OCCURENCE'].map(season_by_month_name).value_counts()
count_FALL = int(sampled_season_totals.get('Fall', 0))
count_WINTER = int(sampled_season_totals.get('Winter', 0))
count_SPRING = int(sampled_season_totals.get('Spring', 0))
count_SUMMER = int(sampled_season_totals.get('Summer', 0))
# Coordinates are read from the same row as the month. The original indexed
# a positional list with the row's index label, which points at a different
# row (or past the end of the list) whenever the index is not 0..n-1.
for latitude, longitude, month_name in zip(
    heatPlotIT['Latitude'],
    heatPlotIT['Longitude'],
    heatPlotIT['Month of OCCURENCE'],
):
    map_month.add_child(folium.CircleMarker(location=[latitude, longitude], fill='true', radius=6, popup='Hi', fill_color=whatMonth(month_name), color='clear', fill_opacity=1))
map_month