import pandas as pd
import numpy as np
import re

import seaborn as sns
import ipywidgets as widgets
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d, Axes3D

import folium 
from folium import plugins

from statsmodels.graphics.mosaicplot import mosaic
import matplotlib.patches as mpatches
import json
from dateutil.parser import parse
%matplotlib inline

Data Loading

food = pd.read_csv("data/Food_Inspections.csv")

# Find all columns with at least one null value
null_cols=np.sum(food.isnull() , axis=0)
null_cols[null_cols>0]

AKA Name                        2451
License #                         17
Facility Type                   4776
Risk                              72
City                             138
State                             42
Zip                               50
Inspection Type                    1
Violations                     51693
Latitude                         682
Longitude                        682
Location                         682
Historical Wards 2003-2015    194904
Zip Codes                     194904
Community Areas               194904
Census Tracts                 194904
Wards                         194904
dtype: int64

food.shape

(194904, 22)

# Lets plot the nans
fig, ax = plt.subplots(figsize=(16,8))
ax.barh(null_cols.index, null_cols.values)
ax.set_title("Null value")
ax.set_xlabel("Number of nans")

Text(0.5, 0, 'Number of nans')

png

# Data Cleaning & Munging

The columns Historical Wards 2003-2015 , Zip Codes , Community Areas , Census Tracts and Ward are all empty. So we drop them

food.drop(columns=["Historical Wards 2003-2015","Zip Codes", "Community Areas", "Census Tracts", "Wards"],axis=1,inplace=True)

Results

results_count = food["Results"].value_counts(sort=False)
results_count

Pass w/ Conditions       26780
No Entry                  6188
Not Ready                 1859
Fail                     37698
Business Not Located        67
Out of Business          16784
Pass                    105528
Name: Results, dtype: int64

fig, ax = plt.subplots(figsize=(8,8))

colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99','#A9BD5A',"#6FBAaD"]

ax.pie(results_count.values,labeldistance=1.05,colors=colors, labels=results_count.index, autopct='%1.1f%%',
          startangle=0)

plt.title("Repartition of the results")

Text(0.5, 1.0, 'Repartition of the results')

png

We are interested in business with inspections that pass or fail so we retain those

results=["Pass","Fail","Pass w/ Conditions"]
food=food[food["Results"].isin(results)]
food.shape

(170006, 17)

Inspection Date

food["Inspection Date"].head()

  10/25/2019
  10/24/2019
  10/24/2019
  10/24/2019
  10/24/2019
Name: Inspection Date, dtype: object

food["Inspection Date"] = food["Inspection Date"].apply(lambda d: parse(d))

food['Year'] = food["Inspection Date"].map(lambda x: x.year)
food['Month'] = food["Inspection Date"].map(lambda x: x.month)
food['Day'] = food["Inspection Date"].map(lambda x: x.day)

It makes sense to have the dataset in chronological order, so we sort it accordingly

food = food.sort_values(by="Inspection Date")

food.head()

	Inspection ID	DBA Name	AKA Name	License #	Facility Type	Risk	Address	City	State	Zip	Inspection Date	Inspection Type	Results	Violations	Latitude	Longitude	Location	Year	Month	Day
192455	67757	DUNKIN DONUTS/BASKIN-ROBBINS	DUNKIN DONUTS/BASKIN-ROBBINS	1380279.0	Restaurant	Risk 2 (Medium)	100 W RANDOLPH ST	CHICAGO	IL	60601.0	2010-01-04	Tag Removal	Pass	NaN	41.884586	-87.631010	(-87.63101044588599, 41.88458626715456)	2010	1	4
194205	67732	WOLCOTT'S	TROQUET	1992039.0	Restaurant	Risk 1 (High)	1834 W MONTROSE AVE	CHICAGO	IL	60613.0	2010-01-04	License Re-Inspection	Pass	NaN	41.961606	-87.675967	(-87.67596676683779, 41.961605669949854)	2010	1	4
193691	67733	WOLCOTT'S	TROQUET	1992040.0	Restaurant	Risk 1 (High)	1834 W MONTROSE AVE	CHICAGO	IL	60613.0	2010-01-04	License Re-Inspection	Pass	NaN	41.961606	-87.675967	(-87.67596676683779, 41.961605669949854)	2010	1	4
194066	67738	MICHAEL'S ON MAIN CAFE	MICHAEL'S ON MAIN CAFE	2008948.0	Restaurant	Risk 1 (High)	8750 W BRYN WAWR AVE	CHICAGO	IL	60631.0	2010-01-04	License	Fail	18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...	NaN	NaN	NaN	2010	1	4
193882	70269	mr.daniel's	mr.daniel's	1899292.0	Restaurant	Risk 1 (High)	5645 W BELMONT AVE	CHICAGO	IL	60634.0	2010-01-04	License Re-Inspection	Pass	NaN	41.938443	-87.768318	(-87.76831838068422, 41.93844282365204)	2010	1	4

Zip

Since there are only 31 cases with nan zip, we simply drop them

food=food[food["Zip"].notnull()]

zip_=food["Zip"].value_counts().cumsum()/food.shape[0]
zip_=zip_.reset_index()
plt.plot(zip_["Zip"])
plt.title("Zip cumsum")
plt.xlabel("Number of zipcodes")
plt.ylabel("cumulative distribution function")
plt.show()

png

20 Zipcodes out of 110 are where more than half of inspections take place, it’s interesting to check in our further analysis whether these places are just highly dense or are they actually more prone to food inspections

City and State

food[food.State.isnull() & food.City.isnull()]["Address"].unique()

array(['636 N RACINE AVE ', '500 N MILWAUKEE AVE ',
       '506 N Milwaukee AVE ', '1438 W CORTLAND ST '], dtype=object)

All Null values in State and City are in Chicago ( checked with the respective adresses)

food[food.State.isnull() & food.City.notnull()]["City"].unique()

array(['CHICAGO'], dtype=object)

Null values in State with a known city are all in Chicago

food[(food.City.isnull()) & (food["State"]!="IL")]["Address"].unique()

array(['636 N RACINE AVE ', '500 N MILWAUKEE AVE ',
       '506 N Milwaukee AVE ', '1438 W CORTLAND ST '], dtype=object)

All Null values in City are in Chicago ( checked with the respective adresses)

food["State"].fillna("IL", inplace=True)
food=food[food["State"]=="IL"]
food["City"].fillna("Chicago", inplace=True)

food[food["City"]!="CHICAGO"]["City"].unique()

array(['chicago', 'Chicago', 'CCHICAGO', 'CHESTNUT STREET',
       'CHICAGOCHICAGO', 'CHCICAGO', 'CHCHICAGO', 'OAK PARK', 'ALSIP',
       'EVERGREEN PARK', 'CHARLES A HAYES', 'EAST HAZEL CREST', 'MAYWOOD',
       'Maywood', 'WORTH', 'CHicago', 'BROADVIEW', 'BLUE ISLAND',
       'NAPERVILLE', 'FRANKFORT', 'GLENCOE', 'CHICAGOI', 'OAK LAWN',
       'CICERO', 'STREAMWOOD', 'OLYMPIA FIELDS', 'BERWYN', 'LAKE ZURICH',
       'DES PLAINES', 'BURNHAM', 'SUMMIT', 'BOLINGBROOK',
       'COUNTRY CLUB HILLS', 'LOMBARD', 'CHICAGO HEIGHTS', 'TINLEY PARK',
       'JUSTICE', 'OOLYMPIA FIELDS', 'ELK GROVE VILLAGE', 'EVANSTON',
       'CALUMET CITY', 'SKOKIE', 'Norridge', 'ELMHURST', 'BLOOMINGDALE',
       'SCHAUMBURG', 'BANNOCKBURNDEERFIELD', 'BEDFORD PARK', 'BRIDGEVIEW',
       'SCHILLER PARK', 'NILES NILES', '312CHICAGO', 'LAKE BLUFF',
       'CHICAGOHICAGO', 'PALOS PARK', 'PLAINFIELD', 'HIGHLAND PARK',
       'CHICAGO.', 'ROSEMONT', 'WHEATON', 'WADSWORTH', 'LANSING',
       'ALGONQUIN'], dtype=object)

Illinois_zipcode=(60001,62999)
suburbs_zipcode=food[food["City"]!="CHICAGO"]["Zip"].unique()
# All the suburbs are within the Illinois zipcode range
all(Illinois_zipcode[0]<=city_zipcode<=Illinois_zipcode[1] for city_zipcode in suburbs_zipcode)

True

All these values are either typos for Chicago or Chicago suburbs (checked by zip code), so we keep them.

Since the columns City and State don’t give us relevant information, we drop them

food=food.drop(columns=["City","State"])

Location

# Fill the null location values with default Chicago location
default_lat=41.85003
default_long=-87.65005
default_location='(-87.65005,41.85003)'
food[food["Location"]==default_location]

	Inspection ID	DBA Name	AKA Name	License #	Facility Type	Risk	Address	Zip	Inspection Date	Inspection Type	Results	Violations	Latitude	Longitude	Location	Year	Month	Day

No other business has this location, so we can drop it when we need to plot

food["Latitude"].fillna(default_lat,inplace=True)
food["Longitude"].fillna(default_long,inplace=True)
food["Location"].fillna(default_location,inplace=True)

food_plot = food[food["Location"] != default_location]

m = folium.Map([41.85003, -87.65005], zoom_start=10)

sample_locaions = food_plot.sample(30000)[["Latitude", "Longitude"]].values

m.add_child(plugins.HeatMap(sample_locaions.tolist(), radius=10))
m

License number

food[food["License #"].isnull()].shape

(16, 18)

food.groupby("License #").Location.nunique().sort_values(ascending=False).head()

License #
0.0          192
2458393.0      3
2225602.0      2
46041.0        2
2511398.0      2
Name: Location, dtype: int64

Since there are 16 nans, and we see that a lot of unidentified license are set to 0, we set them also to 0

food["License #"].fillna(0,inplace=True)

Violations

As seen in the dataset description:

An establishment can receive one or more of 45 distinct violations (violation numbers 1-44 and 70). For each violation number listed for a given establishment, the requirement the establishment must meet in order for it to NOT receive a violation is noted, followed by a specific description of the findings that caused the violation to be issued.

For the null values in Violations, we will just replace them with “0. None reported” since the violations are numbered

food["Violations"].fillna("0. None Reported", inplace=True)

It’s specified in the Food Inspection Reporting System (https://webapps1.chicago.gov/healthinspection/Code_Violations.jsp#1crit) that we can have 3 types of violation :

-For violation number 1-14 it refers to critical violations.
-For violation number 15-29 it refers to serious violations.
-For the rest it refers to no critical violations,minor (30-44,70). 

For each inspection we have a list of a description of each violation discovered, the description is composed of 3 elements:

-The violation number
-The violation title
-Comments

Therefore we parse the descriptions to separate the 3 entities.

# Initiating the 3 columns
violation_number = []
violation_title = []
violation_comment = []
# Initiating the exploded dataframe with 4 columns : ids, number , title and comments
number_exploded = []
ids_exploded = []
title_exploded = []
comment_exploded = []
    
def violation_parsing(row):
    """
    This function iterates over the Violations columns in the original dataset and returns three seperate columns:
    Violation number, title and comments
    Also, it returns a new dataframe where all seperate violations tuples have unique rows
    """
    
    
    

    # spliting seperate violations
    list_violation= row['Violations'].split(' | ')
    numbers = []
    titles = []
    comments = []
    for vl in list_violation:

        ids_exploded.append(row['Inspection ID'])

        # splitting the title from the comments
        comment_splited = vl.split('- Comments:')

        # splitting the number from the title
        split_point = comment_splited[0].split('.')

        # appending the number
        numbers.append(split_point[0])
        number_exploded.append(split_point[0])

        #appending the title
        titles.append(split_point[1].strip())
        title_exploded.append(split_point[1].strip())

        # checking whether a comment exists and then appending it, otherwise we append an empty comment
        if(len(comment_splited) <= 1):
            comments.append("")
            comment_exploded.append("")
        else:
            comments.append(comment_splited[1].strip())
            comment_exploded.append(comment_splited[1].strip())

    violation_number.append(numbers)
    violation_title.append(titles)
    violation_comment.append(comments)

food.apply(violation_parsing,axis=1)
violation_exploded = pd.DataFrame({"ids": ids_exploded ,"numbers" : number_exploded , "titles" : title_exploded, "comments" : comment_exploded})
food ['Violation numbers'] =violation_number
food ['Violation title']=violation_title
food ['Violation comments']=violation_comment 

# showing all different violation codes
count_violation = violation_exploded.groupby(['numbers']).comments.count()
count_violation.index = count_violation.index.map(lambda x: int(x))
count_violation = count_violation.sort_index()
count_violation.index

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
            51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 70],
           dtype='int64', name='numbers')

We are surprised to find violation numbers that are above 44 ! This is really weird as it was not specified in the Food Inspection Reporting System.

# showing their respective titles
weird_cases = violation_exploded[(violation_exploded['numbers'].astype(int) > 44) & (violation_exploded['numbers'] != "70") ]

weird_cases.groupby(['numbers']).titles.value_counts()

numbers  titles                                                                           
     FOOD HANDLER REQUIREMENTS MET                                                         7475
         SINGLE-USE/SINGLE-SERVICE ARTICLES: PROPERLY STORED & USED                             415
     GLOVES USED PROPERLY                                                                    76
     FOOD & NON-FOOD CONTACT SURFACES CLEANABLE, PROPERLY DESIGNED, CONSTRUCTED & USED     6514
     WAREWASHING FACILITIES: INSTALLED, MAINTAINED & USED; TEST STRIPS                     1669
     NON-FOOD/FOOD CONTACT SURFACES CLEAN                                                  4649
     HOT & COLD WATER AVAILABLE; ADEQUATE PRESSURE                                          281
     PLUMBING INSTALLED; PROPER BACKFLOW DEVICES                                           5526
     SEWAGE & WASTE WATER PROPERLY DISPOSED                                                 374
     TOILET FACILITIES: PROPERLY CONSTRUCTED, SUPPLIED, & CLEANED                          1965
     GARBAGE & REFUSE PROPERLY DISPOSED; FACILITIES MAINTAINED                              849
     PHYSICAL FACILITIES INSTALLED, MAINTAINED & CLEAN                                    15320
     ADEQUATE VENTILATION & LIGHTING; DESIGNATED AREAS USED                                4656
     ALL FOOD EMPLOYEES HAVE FOOD HANDLER TRAINING                                         3299
     ALLERGEN TRAINING AS REQUIRED                                                         6277
     PREVIOUS PRIORITY FOUNDATION VIOLATION CORRECTED                                       299
     PREVIOUS CORE VIOLATION CORRECTED                                                      712
     SUMMARY REPORT DISPLAYED AND VISIBLE TO THE PUBLIC                                     246
     COMPLIANCE WITH CLEAN INDOOR AIR ORDINANCE                                              39
     REMOVAL OF SUSPENSION SIGN                                                              13
Name: titles, dtype: int64

colors = ["Purple"] + ["red"] * 14 + ["orange"] * 15 + ["yellow"] *15 + ["green"] *19 + ["yellow"]
df_count = pd.DataFrame({'number' : count_violation.index, 'count' :  count_violation.values, 'color' : colors} )

# plotting violations frequencies
fig, ax = plt.subplots(figsize=(10,10))
ax.barh(df_count['number'] ,df_count['count'], color = df_count['color'])

UK = mpatches.Patch(color='purple', label='Unknown')
CR = mpatches.Patch(color='red', label='Critical')
SR = mpatches.Patch(color='orange', label='Serious')
MI = mpatches.Patch(color='yellow', label='Minor')
DS = mpatches.Patch(color='green', label='Discovered')
plt.legend(handles=[UK,CR,SR,MI,DS], loc=1)
plt.xlabel("Number of violation reported")
plt.ylabel("Violation number")
plt.title(" Number of violation reported per violation number")
plt.show()

png

For each inspection we consider a score that counts the number of violations per type of violation (minor,serious,critical).

The new violation numbers (44-63) are grouped with the minor violations.

def score(numbers):
    """ This function creates a score for the violation according to its category (minor,serious,critical)
    and returns the score as a tuple
    """
    minor = 0
    serious = 0
    critical = 0
    for number in numbers:
        number = int(number)
        if( (number >=1) & (number <=14) ):
            critical+=1
        elif ((number >=15) and (number <=29) ) :
            serious+=1
        elif (number >29) :  
            minor+=1
    return minor,serious,critical

food['Violation score minor']    = food ['Violation numbers'].map(lambda x : score(x)[0])
food['Violation score serious']  = food ['Violation numbers'].map(lambda x : score(x)[1])
food['Violation score critical'] = food ['Violation numbers'].map(lambda x : score(x)[2])

Depending on that score we want to know how the results evolve.

#group by score and count the the number of inspection 
violation_score = food.groupby(['Violation score minor','Violation score serious','Violation score critical']).Results.value_counts().unstack().reset_index()
violation_score.fillna(0, inplace=True)

#transforming the count to an empirical probability
sums = violation_score.loc[:, ['Fail','Pass','Pass w/ Conditions']].sum(axis=1).values.reshape((-1,1))
violation_score.loc[:,['Fail','Pass','Pass w/ Conditions']]= violation_score.loc[:,['Fail','Pass','Pass w/ Conditions']] /sums

violation_score.head()

Results	Violation score critical	Fail	Pass	Pass w/ Conditions
0	0	0.113151	0.870763	0.016086
1	1	0.361307	0.042403	0.596290
2	2	0.114050	0.019835	0.866116
3	3	0.205479	0.000000	0.794521
4	4	0.300000	0.000000	0.700000

#We create a 3D space representing the score space and we plot  the probability to fail in that space
%matplotlib notebook
fig = plt.figure(figsize=(10,5))
ax = Axes3D(fig)

p=ax.scatter3D(violation_score['Violation score minor'], violation_score['Violation score serious'], violation_score['Violation score critical'], c=violation_score['Fail'], cmap='YlOrRd',s=30)

ax.set_xlabel('Violation score minor')
ax.set_ylabel('Violation score serious')
ax.set_zlabel('Violation score critical')
ax.set_title("Probaiblity to fail with the number of failed violations")

fig.colorbar(p,label="Probability to fail")

plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

We can observe in this 3D plot the probability to fail depending on the number of violations per category. Depending on the axis we can observe that the increase of the violations has a different impact on the probability to fail. The violation score minor doesn’t affect the probability to fail where as when we start to increase just a little bit the serious or critical score the probability to fail becomes very close to 1.

#plotting the probability to pass in the space of the score 

fig = plt.figure(figsize=(10,5))

ax = Axes3D(fig)

p = ax.scatter3D(violation_score['Violation score minor'], violation_score['Violation score serious'], violation_score['Violation score critical'], c=violation_score['Pass'], cmap='YlOrRd',s=40)

ax.set_xlabel('Violation score minor')
ax.set_ylabel('Violation score serious')
ax.set_zlabel('Violation score critical')
ax.set_title("Probaiblity to pass with the number of failed violations")

fig.colorbar(p,label="Probability to pass")

plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

We observe that the violation score minor doesn’t affect the probability to pass. We also see that there is a high probability to pass if there are very few serious and critical violation scores

#plotting the probability to pass with condition in the space of the score 

fig = plt.figure(figsize=(10,5))

ax = Axes3D(fig)

p = ax.scatter3D(violation_score['Violation score minor'], violation_score['Violation score serious'], violation_score['Violation score critical'], c=violation_score['Pass w/ Conditions'], cmap='YlOrRd',s=40)

ax.set_xlabel('Violation score minor')
ax.set_ylabel('Violation score serious')
ax.set_zlabel('Violation score critical')
ax.set_title("Probaiblity to Pass w/ Conditions with the number of failed violations")

fig.colorbar(p,label="Probability to pass with condition")
plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In this plot we can see that if you have low scores you will not pass with condition because you will just pass and if you have high scores you will fail. We can observe that if you have a lot of minor violations you have a high probability to pass with conditions .

food.groupby('Results')['Violation score minor'].mean()

Results
Fail                  3.941411
Pass                  2.492176
Pass w/ Conditions    3.540392
Name: Violation score minor, dtype: float64

food.groupby('Results')['Violation score serious'].mean()

Results
Fail                  1.156398
Pass                  0.003592
Pass w/ Conditions    0.590140
Name: Violation score serious, dtype: float64

food.groupby('Results')['Violation score critical'].mean()

Results
Fail                  0.719312
Pass                  0.004350
Pass w/ Conditions    1.231335
Name: Violation score critical, dtype: float64

Facility Type

food["Facility Type"]=food["Facility Type"].str.casefold()

# We can check different grocery store types and set them all to "grocery store"
groceries=food[["grocer" in  str(facility) for facility in food["Facility Type"]]]["Facility Type"]
groceries.unique()

array(['grocery store', 'grocery& restaurant', 'slaughter house/ grocery',
       'grocery/cafe', 'gas station/ grocery store',
       'grocery(sushi prep)', 'pharmacy/grocery', 'grocery and butcher',
       'grocery/butcher', 'grocery/restaurant', 'grocery store/bakery',
       'drug/grocery store', 'grocery store/taqueria', 'grocery/liquor',
       'grocery/taqueria', 'gas station /grocery', 'restaurant/grocery',
       'grocery & restaurant', 'restaurant/grocery store',
       'grocery store/cooking school', 'grocery/ restaurant',
       'rest/grocery', 'grocery store/ restaurant',
       'grocery store/gas station', 'drug store/grocery', 'grocery',
       'grocery/dollar store', 'grocery/service gas station',
       'gas station/store grocery', 'grocery/bakery',
       'dollar store selling grocery', 'grocery store/deli',
       'grocery store / gas station', 'bakery/grocery',
       'grocery/gas station', 'grocery/tavern', 'grocery/liquor store',
       'grocery/drug store', 'dollar & grocery store',
       'deli/grocery store', 'grocery(gas station)', 'grocery/deli',
       'gas station/grocery', 'dollar store with grocery',
       'grocery store /pharmacy'], dtype=object)

food.loc[["grocer" in  str(facility) for facility in food["Facility Type"]],"Facility Type"]="grocery store"

fvc=food["Facility Type"].value_counts()
fvc[fvc>20].head(10)

restaurant                         114651
grocery store                       22308
school                              11720
children's services facility         2864
bakery                               2508
daycare (2 - 6 years)                2373
daycare above and under 2 years      2193
long term care                       1277
catering                              988
mobile food dispenser                 795
Name: Facility Type, dtype: int64

print("We have {0} other categories for a total number of {1} businesses".format(fvc[fvc<20].shape[0], fvc[fvc<20].sum()))

We have 330 other categories for a total number of 1481 businesses

fvc[fvc<20].head(20)

after school program                 19
roof top                             19
mobile desserts vendor               18
hotel                                18
nursing home                         18
paleteria                            18
wrigley roof top                     17
gas station/mini mart                17
rooftops                             16
supportive living                    16
brewery                              16
fitness center                       15
pop-up establishment host-tier ii    15
university cafeteria                 15
theatre                              15
long term care facility              15
mobile food truck                    14
wrigley rooftop                      14
health/ juice bar                    14
airport lounge                       14
Name: Facility Type, dtype: int64

We don’t see a proper way to aggregate these outliers, so it might be interesting to set them to other

lower_than_20 = fvc[fvc<20].index

food.loc[food["Facility Type"].isin(lower_than_20),"Facility Type"] ="other" 

For the null values in Facility Type, we will just replace them with “Unspecified”

food["Facility Type"].fillna("Unspecified", inplace=True)

Now we can see a proper distribution for the top 10 facilities

%matplotlib inline
top10_facilities=food["Facility Type"].value_counts().head(10)
fig1, ax1 = plt.subplots(figsize=(8,8))
theme = plt.get_cmap('Paired')
ax1.set_prop_cycle("color", [theme(1. * i / 10)
                             for i in range(10)])
ax1.set_title("Top 10 most inspected facility types")
ax1.pie(top10_facilities.values,labeldistance=1.1, labels=top10_facilities.index, autopct='%1.1f%%',
          pctdistance=0.9, startangle=0)

plt.show()

png

Risk

food["Risk"].value_counts()

Risk 1 (High)      125149
Risk 2 (Medium)     32808
Risk 3 (Low)        11976
All                     5
Name: Risk, dtype: int64

5 entries are marked as Risk : ALL . As we don’t know the meaning of it and that those entries contain many null values, We choose to drop these entries

food=food[food["Risk"]!="All"].copy()

food[food["Risk"].isnull()]["Inspection ID"].count()

34 entries in Risk are null, since there is no appropiate way to impute them, we choose to drop them

food=food[food["Risk"].notnull()]

Let’s try to understand how risk is distributed

risk_count=food["Risk"].value_counts()
fig, ax = plt.subplots(figsize=(8,8))
ax.set_title("Risk percentage")

ax.pie(risk_count.values,labeldistance=1.1,colors=["red","orange","yellow"], labels=risk_count.index, autopct='%1.1f%%',
          pctdistance=0.9, startangle=0)

plt.show()

png

Let’s see their correlation with the results of inspections

fig, ax = plt.subplots(figsize=(12,8))

props = lambda key: {'color': "yellowgreen" if "Pass" in key else "tomato" if "Fail" in key else "gold"}
mosaic(food,["Risk","Results"], ax=ax ,properties=props)
plt.title("Mosaic of Risk and Results categories")
plt.show()

png

We see that the pass percentage doesn’t really depend on the risk type, although facilities with low risk have slightly more chance to fail

Let’s observe the evolution of the risk thought time.

year_inspection_risk = pd.DataFrame(food.groupby(['Year','Risk'])['Inspection ID'].count()).reset_index()

year_inspection_risk = year_inspection_risk.pivot(index='Year', values='Inspection ID', columns='Risk')

year_inspection_risk.fillna(0, inplace=True)

year_inspection_risk

Risk	Risk 1 (High)	Risk 2 (Medium)	Risk 3 (Low)
Year
2010	11662	3773	1922
2011	11715	4018	1834
2012	11162	3272	1576
2013	12160	3622	1280
2014	13848	3603	1163
2015	13429	3451	1009
2016	15193	3835	831
2017	14420	3200	875
2018	11814	2264	770
2019	9746	1770	716

fig, ax = plt.subplots(figsize=(10,10))

year_inspection_risk.plot(kind='bar',stacked=True,figsize=(8,8),ax=ax )

plt.legend(loc="lower left", bbox_to_anchor=(1,0))
ax.set_title("The distribution of inspection risk by year",size=20)
ax.set_xlabel('Year')
ax.set_ylabel('Number')

Text(0, 0.5, 'Number')

png

fig, ax = plt.subplots(figsize=(10,10))
sns.scatterplot(x='Longitude', y='Latitude', hue='Risk' , data=food, ax=ax, palette = ["blue","red","green"])
ax.set_title("The distribution of inspections by risk",size=20)
ax.set_xlabel('Longitude')
ax.set_ylabel('LATITUDE')
plt.show()

png

We can’t really see a general pattern let’s compute the mean risk per zipcode.

food['Risk number'] = food['Risk'].map(lambda x : int(str(x).split(' ')[1]))
mean_risk_zip = food.groupby('Zip').mean()['Risk number']

fig, ax = plt.subplots(figsize=(5,5))
mean_risk_zip.hist(bins=20)
ax.set_title("Distribution of the risk mean per zipcode")
ax.set_xlabel("Risk mean")
ax.set_ylabel("Distribution")
plt.show()

df_risk_zip = pd.DataFrame({'Zip': mean_risk_zip.index.astype(int).astype(str), 'Risk mean' :mean_risk_zip.values} )

png

We draw the results on the map using GeoJson zipcode file.

with open("data/Boundaries-ZIP-Codes.geojson") as json_file:
    zipcodes = json.load(json_file)
    
m = folium.Map(location=[41.85003,-87.6], tiles='cartodbpositron', zoom_start=10.3)

folium.GeoJson(zipcodes).add_to(m)
m.save("initial_zip.html")
m

This is the map of chicago divided by zipcode

m1 = folium.Map(location=[41.85003,-87.6], tiles='cartodbpositron', zoom_start=10.3)

folium.Choropleth(
    geo_data=zipcodes,
    data=df_risk_zip,
    columns=['Zip', 'Risk mean'],
    key_on='feature.properties.zip',
    fill_color='RdYlBu',
    highlight=True,
    fill_opacity=0.7,
    legend_name="Risk Mean",
    show=False,
).add_to(m1)

m1.save("zip_risk_mean.html")
m1

We now look at the proportion of fail, pass and pass wtih condition per zip code.

food.Results.value_counts()

Pass                  105499
Fail                   37659
Pass w/ Conditions     26775
Name: Results, dtype: int64

food.groupby('Zip').Results.count().sort_values(ascending = False)

Zip
0    6167
0    5922
0    5713
0    5623
0    5327
           ... 
0       1
0       1
0       1
0       1
0       1
Name: Results, Length: 107, dtype: int64

# we group by zip and count the number of fail/pass that we normalize after
dist_zip = food.groupby('Zip').Results.value_counts().unstack().reset_index().fillna(0)

# We sum up the groups

sums = dist_zip.loc[:, ['Fail','Pass','Pass w/ Conditions']].sum(axis=1).values.reshape((-1,1))
# We normalize them

dist_zip.loc[:,['Fail','Pass','Pass w/ Conditions']]= dist_zip.loc[:,['Fail','Pass','Pass w/ Conditions']] /sums

# We count the number of samples to have an idea about the significance of our statistics
dist_zip['# of samples'] = food.groupby('Zip').Results.count().values
dist_zip.Zip = dist_zip.Zip.map(lambda x : str(int(x)) ) 
dist_zip.sort_values(ascending = False , by= "# of samples")

Results	Zip	Fail	Pass	Pass w/ Conditions	# of samples
55	60614	0.226042	0.568672	0.205286	6167
87	60647	0.241810	0.600304	0.157886	5922
95	60657	0.222825	0.593559	0.183616	5713
52	60611	0.165926	0.649475	0.184599	5623
63	60622	0.237094	0.615919	0.146987	5327
...	...	...	...	...	...
14	60148	0.000000	1.000000	0.000000	1
29	60438	0.000000	1.000000	0.000000	1
16	60155	0.000000	1.000000	0.000000	1
27	60423	0.000000	1.000000	0.000000	1
18	60189	0.000000	0.000000	1.000000	1

107 rows × 5 columns

dist_zip["Fail"].sort_values()

  0.000000
  0.000000
  0.000000
  0.000000
  0.000000
        ...   
   0.500000
   0.500000
  0.666667
  0.750000
  0.800000
Name: Fail, Length: 107, dtype: float64

m2 = folium.Map(location=[41.85003,-87.6], tiles='cartodbpositron', zoom_start=10.3)

folium.Choropleth(
    geo_data=zipcodes,
    data=dist_zip,
    columns=['Zip', 'Fail'],
    key_on='feature.properties.zip',
    fill_color='Reds',
    highlight=True,
    fill_opacity=0.7,
    legend_name="Fail probability",
    show=True,
).add_to(m2)

m2.save("zip_fail.html")
m2

m3 = folium.Map(location=[41.85003,-87.6], tiles='cartodbpositron', zoom_start=10.3)

folium.Choropleth(
    geo_data=zipcodes,
    data=dist_zip,
    columns=['Zip', 'Pass'],
    key_on='feature.properties.zip',
    fill_color='RdYlGn',
    highlight=True,
    fill_opacity=0.7,
    legend_name="Pass probability",
    show=False,
).add_to(m3)

m3.save("zip_pass.html")
m3

m4 = folium.Map(location=[41.85003,-87.6], tiles='cartodbpositron', zoom_start=10.3)

folium.Choropleth(
    geo_data=zipcodes,
    data=dist_zip,
    columns=['Zip', 'Pass w/ Conditions'],
    key_on='feature.properties.zip',
    fill_color='Blues',
    highlight=True,
    fill_opacity=0.7,
    legend_name="Pass with conditions probability",
    show=False,
).add_to(m4)

m4.save("zip_pass_conditions.html")
m4

Inspection type

There only is one nul value for Inspection type, we drop it

food = food[food["Inspection Type"].notnull()]

An inspection can be one of the following types:

-canvass, the mostcommon type of inspection performed at a frequency relative to the risk of the establishment;

-consultation, when the inspection is done at the request of the owner prior to the opening of the establishment; 

-complaint, when the inspection is done in response to a complaint against the establishment;

-license, when the inspection is done as a requirement for the establishment to receive its license to operate;

-suspect food poisoning, when the inspection is done in response to one or more persons claiming to have gotten ill as a result of eating at the establishment (a specific type of complaint-based inspection);

-task-force inspection, when an inspection of a bar or tavern is done.

Re-inspections can occur for most types of these inspections and are indicated as such.

food["Inspection Type"].nunique()

We observe that we have several inspection’s types representing the same thing and thus we can regroup them.

food["Inspection Type exact"] = food["Inspection Type"].copy()

# Hardocde grouping of inspection types
def inspection_cleaning(x):
    
    x=str(x).lower()
    x=re.sub("re-","re",x)
    if ('reinspection' in x ):
        return 'reinspection'
    elif ('sfp' in x or 'fbi' in x or 'sick' in x or 'poisoning' in x ):
        x= 'food poisoning' 
    elif ('canvas' in x ):
        x= 'canvass' 
    elif ('complaint' in x ):
        x="complaint"
    elif ('license' in x ):
        x="license"
    elif ('task' in x ) :
        x='task force'
    
    return x
food["Inspection Type"] = food["Inspection Type"].apply(inspection_cleaning)

food["Inspection Type"][0]

'reinspection'

Staphylococcal food poisoning (SFP) is one of the most common food-borne diseases worldwide.

The FBI or Fungus, Bacteria, and Invertebrates are decomposers.

food["Inspection Type"].nunique()

kept = ["canvass", "reinspection", "license", "complaint", "suspected food poisoning", "consultation", "tag removal", "recent inspection", "out of business", "task force", "food poisoning"] 

food["Inspection Type"] = food["Inspection Type"].map(lambda x: x if (x in kept ) else 'other')

Now let’s see their distribution

inspection_count=food["Inspection Type"].value_counts()
fig ,ax=plt.subplots(figsize=(16,8))
ax.set_title("Number of inspections by type")
ax.set_xlabel("Number of inspections")
ax.set_ylabel("Inspection types")
_=ax.barh(inspection_count.index,inspection_count.values)

png

# We count the number pf inspections by type and year
year_inspection_type = pd.DataFrame(food.groupby(['Year','Inspection Type'])['Inspection ID'].count()).reset_index()

year_inspection_type = year_inspection_type.pivot(index='Year', values='Inspection ID', columns='Inspection Type')

year_inspection_type.fillna(0, inplace=True)

year_inspection_type.head()

Inspection Type	canvass	complaint	consultation	food poisoning	license	other	out of business	recent inspection	reinspection	tag removal	task force
Year
2010	7006.0	2920.0	236.0	233.0	2705.0	149.0	285.0	22.0	3437.0	258.0	106.0
2011	7548.0	2880.0	136.0	145.0	2642.0	40.0	1.0	20.0	3680.0	331.0	144.0
2012	6845.0	2743.0	74.0	101.0	2742.0	14.0	0.0	2.0	3462.0	6.0	21.0
2013	8834.0	2198.0	60.0	52.0	2554.0	4.0	0.0	12.0	3344.0	3.0	1.0
2014	10106.0	2261.0	71.0	44.0	2375.0	4.0	0.0	31.0	3720.0	2.0	0.0

fig, ax = plt.subplots(figsize=(10,10))

year_inspection_type.plot(kind='bar',stacked=True,figsize=(8,8),ax=ax )

plt.legend(loc="lower left", bbox_to_anchor=(1,0))
ax.set_title("The distribution of inspection type by year",size=14)
ax.set_xlabel('Year')
ax.set_ylabel('Number')

plt.show()

png

fig, ax = plt.subplots(figsize=(10,5))
                       
plt.stackplot(year_inspection_type.index, year_inspection_type['canvass'], year_inspection_type['complaint'], 
              year_inspection_type['consultation'], year_inspection_type['food poisoning'], year_inspection_type['license'],
              year_inspection_type['other'], year_inspection_type['out of business'], year_inspection_type['recent inspection'], 
              year_inspection_type['reinspection'], year_inspection_type['tag removal'], year_inspection_type['task force'],)

plt.legend(loc="lower left", bbox_to_anchor=(1,0),labels=year_inspection_type.columns)
ax.set_title("The distribution of inspection type by year",size=14)
ax.set_xlabel('Year')
ax.set_ylabel('Number')

plt.show()

png

We want to focus more on the food poisoning cases:

Food Poisoning

from ipywidgets import interact
food_poisoning_cases= food[food['Inspection Type'] == 'food poisoning']
def plot_(ticker):
    
    fig, ax = plt.subplots(figsize=(10,10))
    
    sns.scatterplot(x='Longitude', y='Latitude', 
                    data=food_poisoning_cases [food_poisoning_cases["Year"]==ticker],ax=ax)
    ax.set_title("The distribution of food poisoning inspection by year",size=20)
    ax.set_xlabel('Longitude')
    ax.set_ylabel('LATITUDE')
    ax.set_ylim([41.6,42.1])
    ax.set_xlim([-88,-87.5])
    
interact(plot_,ticker=widgets.IntSlider(min=2010,max=2019,step=1))

interactive(children=(IntSlider(value=2010, description='ticker', max=2019, min=2010), Output()), _dom_classes…

<function __main__.plot_(ticker)>

year_inspection_type['food poisoning'].plot(figsize=(10,5))
plt.title("Distribution of the food poisoning per year")
plt.xlabel("Years")
plt.ylabel("# food poisoning")
plt.show()

png

The number of inspections for food poisoning is interesting and give us some informations. Indeed, from 2010 to 2014 we observe that while the total number of food inspections increases by a factor of 1.2, there is a net decrease (by a factor of 5) of the food poisoning inspections.

So what did change, several hypohtesis are possible :

-H0: The detection algorithm improved so that from the number of inspections needed is smaller
-H1: The inspection had a direct effect and the bad restaurants were closed 

food['Fail'] = food['Results'] == "Fail"
food['Pass'] = food['Results'] == "Pass"
food['Pass w/ Conditions'] =  food['Results'] == "Pass w/ Conditions"

fig = plt.figure(figsize=(15,5))

ax1 = fig.add_subplot(121) 
ax2 = fig.add_subplot(122) 

failures = food.groupby("Inspection Type").sum()['Fail'].copy()
successes = food.groupby("Inspection Type").sum()['Pass'].copy()
condtions = food.groupby("Inspection Type").sum()['Pass w/ Conditions'].copy()

group1 = ['canvass', 'complaint', 'license', 'reinspection','']
group2 = ['consultation' ,'food poisoning', 'recent inspection', 'tag removal', 'task force','other','']

failures[group1].plot(kind ='bar',ax=ax1, color='r',position=0,width=0.3,legend=True)
condtions[group1].plot(kind ='bar', ax=ax1, color='y',position=1,width=0.3,legend=True)
successes[group1].plot(kind ='bar', ax=ax1, color='g',position=2,width=0.3,legend=True)

ax1.set_title("Distribution of results per inspection type ",size=14)
ax1.set_xlabel('Inspection type')
ax1.set_ylabel('Number of result')

failures[group2].plot(kind ='bar',ax=ax2, color='r',position=0,width=0.3,legend=True)
condtions[group2].plot(kind ='bar', ax=ax2, color='y',position=1,width=0.3,legend=True)
successes[group2].plot(kind ='bar', ax=ax2, color='g',position=2,width=0.3,legend=True)

ax2.set_title("Distribution of results per inspection type ",size=14)
ax2.set_xlabel('Inspection type')
ax2.set_ylabel('Number of result')

plt.show()

C:\Users\hm.haitham\Anaconda3\lib\site-packages\pandas\core\series.py:1152: FutureWarning: 
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]

png

We are intersted in how food poisoning evolve through time.

food_poisoning= food[food['Inspection Type'] == 'food poisoning']
categories_food_poisonning= food_poisoning.groupby('Year').sum()[['Fail', 'Pass' , 'Pass w/ Conditions']]
total_numbers= categories_food_poisonning.sum(axis=1)
categories_food_poisonning_norm = categories_food_poisonning/total_numbers.values.reshape((-1,1))

categories_food_poisonning_norm.plot(kind='bar', color= ('r','g', 'y'), figsize = (15,5))
plt.title("Distribution of results of food poisoning inspection per year")
plt.ylabel("Distribution")

plt.show()

png

Adress

food["Address"]=food["Address"].str.strip()

Names

The null values in AKA Name can be replaced with their correspending BDA name

food.loc[food["AKA Name"].isnull(),"AKA Name"] = food.loc[food["AKA Name"].isnull(),"DBA Name"]

# casefold
food["DBA Name"]=food["DBA Name"].str.casefold()
food["AKA Name"]=food["AKA Name"].str.casefold()
# remove anything within paranthesis
food["AKA Name"]=food["AKA Name"].apply(lambda x: re.sub(r'\([^)]*\)', '', x ))
# Let's inspect joint businesses, these have a "/" in their AKA Name
joint=food[["/" in aka for aka in food["AKA Name"]]]["AKA Name"].unique()
joint[0:20]

array(['dunkin donuts/baskin-robbins', 'jewel / osco',
       'south side fuel ctr/dunkin doughnuts',
       'dunkin donuts / baskin robbins', 'catering/main kitchen',
       'dunkin donuts / baskin robins', 'montessori acdy. inft/tod. cnt',
       'cvs/pharmacy #2981', 'kfc/long john silver', 'mayne/act one',
       'italian village / vivere', 'mayne stage/act one cafe',
       'northwest middle/belmont-cragin',
       'collins academy/north lawndale college prep',
       'legno/suparossa carry out', 'am/pm',
       'dunkin  donuts / baskin robbins', 'solo cup co./aramark services',
       'pink monkey / new york strip', "kmart/little caesar's"],
      dtype=object)

# Lots of them are in the form "cvs/pharmacy #xxxxx" so we can fix this 
cvs=food[[aka.startswith("cvs") for aka in food["AKA Name"]]]["AKA Name"].index
food.loc[cvs,"AKA Name"]="cvs"

Let’s check if we still have nan values:

null_cols=np.sum(food.isnull() , axis=0)
null_cols[null_cols>0]

Series([], dtype: int64)

<p style="text-align: center;">Analysing the Yelp Dataset</p>

<p style="text-align: center;"></p>

1. Description of the dataset

Yelp is a business directory service and crowd-sourced review forum, it focuses on publishing crowd-sourced reviews about businesses.

To leverage useful data from the Yelp website, we use a combination of Yelp’s API - Yelp Fusion and web scraping in order to get the reviews of the restaurants in Chicago that have been labeled with “ food poisoning “ during food inspections.

a. Data collection

We started by looking into the public dataset offered by Yelp itself in https://www.yelp.com/dataset but unfortunately it didn’t contain info about businesses in Chicago. So we decided to take matters in our hands and extract our own dataset.

We started by getting all the businesses that have been labeled with “ food poisoning “ during inspections from the Chicago Food Inspection dataset, from that list of businesses we requested the Yelp API Match Endpoint with the name of each business and its address to get the unique Yelp alias of the business. In Yelp every business has both a unique ID, such as 4kMBvIEWPxWkWKFN__8SxQ, as well as a unique alias, such as yelp-san-francisco. Generally, business ID and business alias are interchangeable methods of identifying a Yelp Business. The business alias is more human-readable, but often longer, than the business ID.

From that previously obtained alias, we query the website for https://www.yelp.com/biz/alias to obtain all the reviews of a restaurant. That step is costly in terms of time spent since it is easy to get blacklisted from yelp’s website. We use different proxies and user agents to gather our data from the website.

The dataset shown here is just a sample of the entire dataset that we will submit for the next milestone, getting all the reviews we want takes time so for now we will do our analysis on 4130 reviews from 123 restaurants that are affected by food poisoning.

We also analyze the dataset containing said restaurants and information about those according to yelp.

b. Data description

Here is a brief description in each column of the restaurants dataset :

Name	Type	Description
Address	String	Address of the establishment according to our food inspection dataset
Lincense #	int	License number of the establishment according to food inspection dataset
Name	String	Name of the establishment
alias	String	Unique Yelp alias of this business.
id	String	Unique Yelp id of this business
is_closed	bool	Wether the business has been (permanently) closed
overall_rating	float	Rating for this business (value ranges from 1, 1.5, … 4.5, 5).
price	string	Price level of the business. Value is one of `$` , `$$`, `$$$`, `$$$$`
review_count	int	Number of reviews for this business.

The columns for the review dataset is pretty self explanatory, it contains the body of the review, the author of the review, the date for when the review was issued, the rating of the restaurant and a unique identifier for the restaurant ( alias ).

2. Loading the data

# all useful imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS # this module is for making wordcloud in python
import re # regular expression
import string # for finding punctuation in text
import nltk # preprocessing text
from collections import Counter 
from textblob import TextBlob

#Import dataset from previously obtained CSV file
restaurants_food_poisoning_reviews = pd.read_csv("data/sample_food_poisoning.csv",index_col=0).drop_duplicates()
restaurants_food_poisoning_reviews.head()

	reviewRating	datePublished	description	author	alias
0	{'ratingValue': 1}	2019-09-06	Grabbed a takeout order tonight on a Friday, p...	Paul S.	picante-taqueria-chicago-2
1	{'ratingValue': 1}	2019-08-13	I almost passed out eating here when I found a...	Mariam R.	picante-taqueria-chicago-2
2	{'ratingValue': 1}	2019-07-28	First time we ever have gone to Picante and wa...	Maria Guadalupe V.	picante-taqueria-chicago-2
3	{'ratingValue': 1}	2019-05-31	Bland. Customer service needs work. Offensive ...	Dana H.	picante-taqueria-chicago-2
4	{'ratingValue': 1}	2019-04-22	Steak burrito ruined my day, stay away! Not t...	Daniel C.	picante-taqueria-chicago-2

#Shape of the dataset
restaurants_food_poisoning_reviews.shape

(4130, 5)

# Importing second dataset including info about the restaurants without the reviews
restaurants_food_poisoning = pd.read_csv("data/restaurant_aliases_food_poisoning_withrating.csv",index_col=0).drop_duplicates()
restaurants_food_poisoning.head()

	Address	License #	Name	alias	id	overall_rating	price	review_count
0	2018 W DIVISION ST	2042823.0	picante taqueria	picante-taqueria-chicago-2	EMj-Ke8PAxA0ppWbcsirAg	3.5	$	336.0
1	2032-2036 W Devon AVE	2423912.0	ghareeb nawaz restaurant	ghareeb-nawaz-chicago	sYzkzjM2upp2vW32vPaJ9w	4.0	$	807.0
2	7723 S STATE ST	2583156.0	jerk taco man	jerk-taco-man-chicago-5	aInJI0-1j0L1f5BvVgoE4g	2.5	NaN	47.0
3	1676 W OGDEN AVE	2534801.0	taqueria el rey del taco fashion 1676 inc	su-taqueria-el-rey-del-taco-chicago-5	CeoR8-qNyXf5Rw--IUP3yg	4.5	NaN	20.0
4	4337 N LINCOLN AVE	45189.0	the northman	the-northman-chicago-2	XWbMHh7WQRu7tEZzIRKEgg	4.5	$$	235.0

#Shape of the dataset
restaurants_food_poisoning.shape

(94, 9)

2. Cleaning the data

# Extract rating from reviewRating then change the type of the column to int
restaurants_food_poisoning_reviews["reviewRating"]=restaurants_food_poisoning_reviews["reviewRating"].str.extract('(\d+)')
restaurants_food_poisoning_reviews = restaurants_food_poisoning_reviews.astype({'reviewRating': 'int32'})
restaurants_food_poisoning_reviews.head()

	reviewRating	datePublished	description	author	alias
0	1	2019-09-06	Grabbed a takeout order tonight on a Friday, p...	Paul S.	picante-taqueria-chicago-2
1	1	2019-08-13	I almost passed out eating here when I found a...	Mariam R.	picante-taqueria-chicago-2
2	1	2019-07-28	First time we ever have gone to Picante and wa...	Maria Guadalupe V.	picante-taqueria-chicago-2
3	1	2019-05-31	Bland. Customer service needs work. Offensive ...	Dana H.	picante-taqueria-chicago-2
4	1	2019-04-22	Steak burrito ruined my day, stay away! Not t...	Daniel C.	picante-taqueria-chicago-2

restaurants_food_poisoning_reviews.dtypes

reviewRating      int32
datePublished    object
description      object
author           object
alias            object
dtype: object

# We check for null values in columns
restaurants_food_poisoning_reviews.columns[restaurants_food_poisoning_reviews.isnull().any()].tolist()

[]

# We check for empty strings
restaurants_food_poisoning_reviews.columns[(restaurants_food_poisoning_reviews=='').any()].tolist()

[]

# now to the restaurants info dataset 
restaurants_food_poisoning.dtypes

Address            object
License #         float64
Name               object
alias              object
id                 object
is_closed         float64
overall_rating    float64
price              object
review_count      float64
dtype: object

# We check for null values in columns
restaurants_food_poisoning.columns[restaurants_food_poisoning.isnull().any()].tolist()

['price']

# To deal with the missing values in price we later plot a distribution over the price variable and then decide what do we set it

# We check for empty strings
restaurants_food_poisoning.columns[(restaurants_food_poisoning=='').any()].tolist()

[]

# The id column was only useful for the yelp part, aliases are already a unique identifier and easier to read so we get rid of it
restaurants_food_poisoning = restaurants_food_poisoning.drop(['id'],axis = 1)

# replace dollar sign to get data that can be parsed because dollar signs cause problems in strings
restaurants_food_poisoning = restaurants_food_poisoning.astype({'price': 'str'})
restaurants_food_poisoning['price'] = restaurants_food_poisoning['price'].str.replace('$', '\$')

3. Understanding what’s in the data

a. Distributions

# We get the distribution of reviews over the years
restaurants_food_poisoning_reviews['year_of_review'] = restaurants_food_poisoning_reviews.datePublished.map(lambda x : int(x[:4]))

# Plotting the distribution of the reviews over the years
plt.figure(figsize=(10, 5))
p = sns.countplot(data = restaurants_food_poisoning_reviews, y = 'year_of_review', orient="h").invert_yaxis()
plt.title("Distribution of the reviews over the years",fontsize=15)
plt.xlabel('Reviews count',fontsize=15)
plt.ylabel('Year of the reviews publishment',fontsize=15);

png

# We analyse bad ratings of reviews by getting the restaurants with most bad rating scores. As we work on a sample
# of the data, reviews counts may be lower for some restaurants.
ratings = restaurants_food_poisoning_reviews[restaurants_food_poisoning_reviews.reviewRating == 1].groupby(['alias', 'reviewRating']).agg('size').to_frame('count').reset_index().sort_values(by='count', ascending=False)
ratings[:20]

	alias	reviewRating	count
40	fremont-chicago	1	40
12	bbq-supply-chicago-3	1	40
55	lao-sze-chuan-uptown-chicago	1	40
88	river-roast-chicago-3	1	40
34	dolo-restaurant-and-bar-chicago-3	1	40
35	duseks-board-and-beer-chicago-2	1	40
18	borinquen-restaurant-chicago-3	1	40
94	sonic-drive-in-chicago	1	40
15	blaze-n-grill-chicago	1	40
84	picante-taqueria-chicago-2	1	40
25	chengdu-impression-chicago	1	40
43	ghareeb-nawaz-chicago	1	40
65	marianos-fresh-market-chicago-10	1	40
8	american-junkie-chicago	1	40
64	marianos-chicago-3	1	40
49	ja-grill-hyde-park-chicago	1	40
63	marianos-chicago-13	1	40
106	the-halal-guys-chicago	1	40
62	margaritaville-chicago-chicago	1	40
48	italian-express-chicago	1	32

plt.figure(figsize=(10, 5))
#p = sns.barplot(x ='alias', y = 'count', data = ratings[:10], orient="h").invert_yaxis();
p = sns.barplot(data = ratings[:20], x='count', y = 'alias', orient="h")
plt.title("Names of business with the most One Star Reviews",fontsize=15)
plt.xlabel('count of One Star reviews',fontsize=15)
plt.ylabel('Alias of restaurant',fontsize=15);

png

This plot shows the names of business from the Yelp reviews sample with the most One Star Reviews. Since we are handling a sample, we unfortunately didn’t get a clearer ranking in terms of number of One Star Reviews since the number of reviews we have so far is limited. This plot would be more significant when we will work with the entire dataset.

# Now for the restaurants info dataset
# Since the price column contains dollars, we are going to change dollar signs to integers
restaurants_food_poisoning['price'].unique()

array(['$', nan, '$$', '$$$', '$$$$'], dtype=object)

# Plotting the distribution of the price range by business
dat = restaurants_food_poisoning.groupby(['price']).agg('size').to_frame('count').reset_index()
ax = sns.barplot(x="price",y="count", data=dat)
plt.title("Distribution of the price range by business",fontsize=15)
plt.xlabel('Price range',fontsize=15)
plt.ylabel('Businesses_count',fontsize=15);

png

Here we can see from our sample that the data is right skewed , the restaurants that fail food inspections for food poisoning are more likely to be cheap

# We replace the nans of the price range with the most reprensented value $$
# this can change when we get the whole dataset 
restaurants_food_poisoning['price'] = restaurants_food_poisoning['price'].replace('nan', '\$\$', regex=True)

# Plotting the distribution of the overall_ratings by business
dat = restaurants_food_poisoning.groupby(['overall_rating']).agg('size').to_frame('count').reset_index()
ax = sns.barplot(x="overall_rating",y="count", data=dat)
plt.title("Distribution of the overall ratings by business",fontsize=15)
plt.xlabel('Overall ratings',fontsize=15)
plt.ylabel('Businesses_count',fontsize=15);

png

We can see here that the ratings are left skewed, so restaurants still have good grades despite having had food poisoning

# Plotting the distribution of the overall_ratings by business
dat = restaurants_food_poisoning.groupby(['is_closed']).agg('size').to_frame('count').reset_index()
ax = sns.barplot(x="is_closed",y="count", data=dat)
plt.title("Distribution of the permanently closed businesses",fontsize=15)
plt.xlabel('is_closed ',fontsize=15)
plt.ylabel('Businesses_count',fontsize=15);

png

Here 0 means that the restaurant is currently opened and 1 means that the restaurant is permanently closed, we can clearly see from this sample that the restaurants that faced food inspections and failed for food poisoning have a high chance of being closed permanently today

# Plotting the distribution of the overall number of reviews by business
plt.figure(figsize=(4, 8))
sns.boxplot(restaurants_food_poisoning.review_count, 
 orient='v', color='#F6A6A0')
plt.ylabel('Number of reviews per busniness')
plt.title("Distribution of the overall number of reviews per business");

png

Here we see that most of the restaurants have between 50 and 200 reviews

# We plot the top 10 businesses by number of overall reviews
restaurants_food_poisoning.sort_values(by='review_count', ascending=False)

---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

<ipython-input-1-d4d92bb30ce6> in <module>
      1 # We plot the top 10 businesses by number of overall reviews
----> 2 restaurants_food_poisoning.sort_values(by='review_count', ascending=False)


NameError: name 'restaurants_food_poisoning' is not defined

b. Correlations

# Correlation between variables in the restaurants dataset
restaurants_food_poisoning[['is_closed','overall_rating','review_count']].corr()

	is_closed	overall_rating	review_count
is_closed	1.000000	-0.221065	-0.027791
overall_rating	-0.221065	1.000000	0.375824
review_count	-0.027791	0.375824	1.000000

# Plotting the heatmap.
plt.figure(figsize=(5,5))
ax = sns.heatmap(restaurants_food_poisoning[['is_closed','overall_rating','review_count']].corr(), annot=True, fmt=".2f", cmap = "coolwarm")

# This is a small fix to have the heatmap not be cropped on the top and bottom parts.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
ax.set_title('Heatmap of the correlation between variables in the restaurants dataset.')

ax;

png

Here we see that the correlation between variables stays relatively low, even though we can see that the higher the review_count is the higher the overall_rating for that restaurant is which makes sense since popular restaurants normally are popular for a “reason” so they may serve at least decent food. We also see that the restaurant that the fact that a restaurant has bad overall rating affects if a restaurant is closed or not.

4. Enriching, filtering, transforming the data according to your needs.

a. Enriching the data

- Detecting the language of reviews

# We detect the language of the reviews for the coming text analysis
from langdetect import detect
restaurants_food_poisoning_reviews["language"] = restaurants_food_poisoning_reviews.description.map(lambda x: detect(x))

# We see that there are to languages, English and Spanish, we delve further to see what comments were actually said in spanish
restaurants_food_poisoning_reviews["language"].unique()

array(['en', 'es'], dtype=object)

restaurants_food_poisoning_reviews[restaurants_food_poisoning_reviews["language"] == 'es']['description'].iloc[0]

"Yesterday I went to eat here and the food is excellent I like it but it is unfortunate that they have a manager named CORNELIUS who is racist, I do not let myself eat in the place and I call the police, maybe I have a complex with Hispanics but I do not  blamed the place, maybe they don't even know the serious problems CORNELIUS has inside\n\nayer fui a comer aqui y la comida es excelente me gusta pero es lamentable que tengan un manager de nombre CORNELIUS que es racista, no me dejo comer en el lugar y me llamo a la policia, quiza tenga algun complejo con los hispanos pero no le hecho la culpa al local, quiza no saben aun los graves problemas que tiene CORNELIUS en su interior"

# We see that we only have one review in spanish for this sample, thus we just choose to ignore it for now when we do further text processing
# For the next milestone, when we have complete data, if we see that the numbear of reviews in languages other than english is significant
# we will treat them accordingly, by doing a specific preprocessing to each language according to their respective stopwords and 
# study the relationship between words in all languages.
# For now we focus on the english language since it is difficult to leverage useful data from only one review

restaurants_food_poisoining = restaurants_food_poisoning_reviews[restaurants_food_poisoning_reviews["language"] == 'en']

b. Transforming the text in reviews to remove noise

# Introduce a preprocessing function to get rid of stopwords in the description of reviews
def preprocess(x):
    x = re.sub('[^a-z\s]', '', x.lower())                  # get rid of noise
    x = [w for w in x.split() if w not in set(stopwords)]  # remove stopwords
    return ' '.join(x) # then join the text again

#Constructing a list of stopwords

#download all stopwords from the nltk library
nltk.download('stopwords')

#get a list of all english stopwords
i = nltk.corpus.stopwords.words('english')

#get a list of all punctuations
j = list(string.punctuation)

stopwords = set(i).union(j)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KaabachiBayrem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

# filter the column description according to the stopwords 
restaurants_food_poisoning_reviews["description_filtered"]= restaurants_food_poisoning_reviews.apply(lambda row : preprocess(row["description"]) , axis = 1)
restaurants_food_poisoning

	reviewRating	datePublished	description	author	alias	year_of_review	language	description_filtered
0	1	2019-09-06	Grabbed a takeout order tonight on a Friday, p...	Paul S.	picante-taqueria-chicago-2	2019	en	grabbed takeout order tonight friday place dea...
1	1	2019-08-13	I almost passed out eating here when I found a...	Mariam R.	picante-taqueria-chicago-2	2019	en	almost passed eating found bug inside steak ta...
2	1	2019-07-28	First time we ever have gone to Picante and wa...	Maria Guadalupe V.	picante-taqueria-chicago-2	2019	en	first time ever gone picante disappointed mind...
3	1	2019-05-31	Bland. Customer service needs work. Offensive ...	Dana H.	picante-taqueria-chicago-2	2019	en	bland customer service needs work offensive pa...
4	1	2019-04-22	Steak burrito ruined my day, stay away! Not t...	Daniel C.	picante-taqueria-chicago-2	2019	en	steak burrito ruined day stay away worst food ...
...	...	...	...	...	...	...	...	...
455	2	2015-07-29	Service and quality of the food has definitel...	tracy h.	borinquen-lounge-chicago	2015	en	service quality food definitely gone used one ...
456	2	2015-01-28	Mediocre at best. Had a soup, included bones.....	Jon W.	borinquen-lounge-chicago	2015	en	mediocre best soup included bones broth bland ...
457	2	2014-06-30	A bit disappointing, I used to love this plac...	Elsi G.	borinquen-lounge-chicago	2014	en	bit disappointing used love place finally deci...
458	2	2014-06-28	The 2 starts are for the service but the food ...	Linda A.	borinquen-lounge-chicago	2014	en	starts service food good jibarito cooked super...
459	2	2013-10-13	I guess sometimes, yelp gets it wrong. This pl...	Nathaniel C.	borinquen-lounge-chicago	2013	en	guess sometimes yelp gets wrong place subpar f...

4130 rows × 8 columns

4. Deep dive analysis of two businesses that failed food inspections for food poisoning

a. BBQ Supply Co restaurant

# Get all the reviews of BBQ Supply CO restaurant
bbq_reviews  = restaurants_food_poisoning_reviews[restaurants_food_poisoning_reviews.alias == 'bbq-supply-chicago-3']
bbq_reviews.head()

	reviewRating	datePublished	description	author	alias	year_of_review	language	description_filtered
1575	1	2019-09-30	I asked for some extra chips and the guy bring...	JP H.	bbq-supply-chicago-3	2019	en	asked extra chips guy bringing back eating lol...
1576	1	2018-04-29	Tried to order from here. They cancelled my or...	Kim M.	bbq-supply-chicago-3	2018	en	tried order cancelled order store reason calle...
1577	1	2017-12-12	I was hoping to like this place and had a stro...	Tashia A.	bbq-supply-chicago-3	2017	en	hoping like place strong craving bbq days seve...
1578	1	2017-10-21	Well I really had high hopes for this place. ...	Joshua W.	bbq-supply-chicago-3	2017	en	well really high hopes place could give zero s...
1579	1	2017-10-13	Everything was way too smoky...chips...everyth...	Gina L.	bbq-supply-chicago-3	2017	en	everything way smokychipseverything absolutely...

We make a word cloud of BBQ Supply Co restaurant reviews to have an idea about most used words over the reviews

wc = WordCloud(background_color="white",width=1600, height=800, random_state=1, max_words=200)
# Join all strings together from the filtered description column to obtain all review text for the restaurant
wc.generate(' '.join(bbq_reviews['description_filtered']))


plt.figure(figsize=(15,10))
# Add title to figure
plt.title("BBQ Supply Co restaurant reviews", fontsize=40)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout(pad=10)

png

# We get the 20 most occuring words over the reviews. 

# Pass the split_it list to instance of Counter class. 
splitted = bbq_reviews['description_filtered'].apply(lambda x: x.split()).to_list()
flatten = [item for sublist in splitted for item in sublist]
counter = Counter(flatten) 
  
# most_common() produces k frequently encountered 
# input values and their respective counts. 
most_occur = counter.most_common(20) 
most_occur

[('bbq', 40),
 ('food', 36),
 ('place', 32),
 ('good', 28),
 ('like', 28),
 ('order', 23),
 ('cheese', 23),
 ('brisket', 22),
 ('pulled', 19),
 ('even', 18),
 ('pork', 17),
 ('ribs', 17),
 ('back', 16),
 ('mac', 16),
 ('chips', 15),
 ('one', 15),
 ('time', 15),
 ('eat', 14),
 ('would', 14),
 ('ive', 14)]

Conclusion ==>

When inspecting the word cloud, we can see some negative words like bad, wrong and disappointed. The 20 most occuring words show us no sign of negative reviews nor food poisoning.

In order to have a sharper view on bad reviews, we make a sentiment analysis over reviews.

Sentiment analysis is the process of computationally identifying and categorizing opinions expressed in a text. Especially we apply this process on our reviews in order to determine whether the writer’s attitude towards a restaurant’s service is positive, negative, or neutral.

#Now we add a column which has the sentiment values of the each review posted for BBQ Supply Co, to see how the 
#author of this review feels about the restaurant.

#Create Text Blob Object and draw out the sentiment of each review by returning a value between -1.0 (extreme negative) 
#and 1.0 (extreme positive).
bbq_reviews['sentiment_analysis'] = bbq_reviews['description_filtered'].apply(lambda x: TextBlob(x).sentiment.polarity)
bbq_reviews = bbq_reviews.sort_values(by='sentiment_analysis', ascending=True)
bbq_reviews[:10]

C:\Users\KaabachiBayrem\.conda\envs\ada\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

	reviewRating	datePublished	description	author	alias	year_of_review	language	description_filtered	sentiment_analysis
1601	1	2015-02-20	I just received delivery of a pulled pork sand...	k h.	bbq-supply-chicago-3	2015	en	received delivery pulled pork sandwich mac n c...	-1.000000
1576	1	2018-04-29	Tried to order from here. They cancelled my or...	Kim M.	bbq-supply-chicago-3	2018	en	tried order cancelled order store reason calle...	-0.800000
1594	1	2016-01-03	Literally the worst bbq and worst service at a...	Corie Z.	bbq-supply-chicago-3	2016	en	literally worst bbq worst service restaurant i...	-0.333333
1596	1	2015-06-22	Fatty Brisket with no flavor. St. Louis cut po...	Li S.	bbq-supply-chicago-3	2015	en	fatty brisket flavor st louis cut poor flavor ...	-0.194444
1612	1	2013-06-06	Worst BBQ I've ever had. For a barbecue joint...	Paco R.	bbq-supply-chicago-3	2013	en	worst bbq ive ever barbecue joint st louis sty...	-0.172153
1593	1	2016-04-29	Smelled delish when we walked in the door. But...	Nancy T.	bbq-supply-chicago-3	2016	en	smelled delish walked door went hill sat back ...	-0.171429
1591	1	2016-06-01	My brother and I came here last month and it w...	Charisse B.	bbq-supply-chicago-3	2016	en	brother came last month extremely disappointin...	-0.161966
1603	1	2014-12-19	The stale vinegar odor got me first, before I ...	Barbara N.	bbq-supply-chicago-3	2014	en	stale vinegar odor got first even opened styro...	-0.090833
1585	1	2017-04-06	I'll give BBQ Supply at least one star since I...	Nigel P. C.	bbq-supply-chicago-3	2017	en	ill give bbq supply least one star since cant ...	-0.074306
1588	1	2017-03-31	I used to love this place when it was Rubs, at...	Holly T.	bbq-supply-chicago-3	2017	en	used love place rubs ate many times took cooki...	-0.068366

Here is the first top 5 most negative reviews to BBQ Supply Co restaurant :

o I just received delivery of a pulled pork sandwich and Mac 'n' cheese ... One minute after eating it I promptly vomited. All I could taste was liquid smoke.  I threw the rest of my lunch. Disgusting.

o Tried to order from here. They cancelled my order in store for no reason. I called and they said they could re-take it but the money I paid for with my card is already gone, waiting 3-5 business days for a refund, that's stupid.

o Literally the worst bbq and worst service at a restaurant I\'ve ever had. Let me start by saying I love smoked bbq. I\'ve eaten at dozens of bbq restaurants over the country and have smoked my own meat as well. I\'m no expert at bbq but I definitely enjoying eating it. However, the food here tasted like cigarette smoke - not bbq smoke. I couldn\'t swallow or even stand the smell of this ashtray food. The worst part of the whole experience is that when I complained to the server about the food she said "there\'s nothing wrong with that food". I told her it\'s awful and won\'t eat it. She said I didn\'t have to eat it but I would still have to pay for it. After arguing with the manager on duty, they refused to change my bill or offer me something else to eat. I reluctantly paid the bill for food I didn\'t eat (left it on the table with only about 4 bites taken) and walked out hungry. I\'ve never experienced such ignorance from a business. Definitely will not return.

o Fatty Brisket with no flavor. St. Louis cut poor flavor. What do they have against salt. Over priced, over rated and typical White folks BBQ. If Bubba ain't on the pit or seasoning the  meat then Bye. If anyone in the Kitchen have no roots past Virginia then say hello to average mac, corn bread and any thing else that goes well with BBQ. Another rip off

o Worst BBQ I've ever had.  For a barbecue joint having only St. Louis Style Ribs is a major oversight.  Baby Backs should at least be an option.  I could even forgive the lack of Baby Back Ribs option if the overpriced St. Louis Style Ribs were actually good but they aren't.  I ordered a full slab and was met with ribs that were tasteless and tough as tire rubber.  Bread on the side, which in all other rib shacks equals a slice of white bread, consisted of a sandwich bun.  Seriously?!  Don't look for Fries or anything else needing a deep fryer on the menu either because they lack a deep fryer.  My wife was disappointed in her dry pulled pork sandwich as well.  The BBQ sauces were locked tighter than Fort Knox in their containers and nearly impossible to extricate.  Presentation was poor as well.  The full slab was cut up into individual ribs and placed into a paper holding tray.  No where on the metal tray was there anywhere to discard the rib bones.  They teach cooking classes at Rub as well which shouldn't be allowed considering the quality of their food.  Overall, a very poor example of BBQ.  I suggest Hecky's, Uncle Willy's, or Uncle John's if you want quality ribs in Chicago.

When reading the first review with -1 sentiment value, we detect a food poisoning sign: ‘One minute after eating it I promptly vomited. All I could taste was liquid smoke’.

b. Margaritaville restaurant

# Get all the reviews of BBQ Supply CO restaurant
margaritaville_reviews = restaurants_food_poisoning_reviews[restaurants_food_poisoning_reviews.alias == 'margaritaville-chicago-chicago']
margaritaville_reviews.head()

	reviewRating	datePublished	description	author	alias	year_of_review	language	description_filtered
420	1	2019-11-08	The people who work here do not want customers...	Ann F.	margaritaville-chicago-chicago	2019	en	people work want customers sign read gone fish...
421	1	2019-11-08	Wow! Just walked in. Not busy at all! Total...	Michelle F.	margaritaville-chicago-chicago	2019	en	wow walked busy totally ignored walked saw ple...
422	1	2019-11-08	We had 2 small grandchildren with us, asked wh...	Judy K.	margaritaville-chicago-chicago	2019	en	small grandchildren us asked kids menus waitre...
423	1	2019-10-23	We waited for some time and the waitress came ...	Thomas L.	margaritaville-chicago-chicago	2019	en	waited time waitress came said would serve us ...
424	1	2019-09-28	So I sent an email out about my horrible exper...	Jacqueline R.	margaritaville-chicago-chicago	2019	en	sent email horrible experience lady first seem...

wc = WordCloud(background_color="white",width=1600, height=800, random_state=1, max_words=200)
# Join all strings together from the filtered description column to obtain all review text for the restaurant
wc.generate(' '.join(margaritaville_reviews['description_filtered']))


plt.figure(figsize=(15,10))
# Add title to figure
plt.title("Margaritaville restaurant reviews", fontsize=40)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout(pad=10)

png

# We get the 20 most occuring words over the reviews. 

# Pass the split_it list to instance of Counter class. 
splitted = margaritaville_reviews['description_filtered'].apply(lambda x: x.split()).to_list()
flatten = [item for sublist in splitted for item in sublist]
counter = Counter(flatten) 
  
# most_common() produces k frequently encountered 
# input values and their respective counts. 
most_occur = counter.most_common(20) 
most_occur

[('us', 35),
 ('food', 33),
 ('service', 22),
 ('would', 20),
 ('minutes', 19),
 ('place', 17),
 ('came', 17),
 ('like', 15),
 ('people', 14),
 ('time', 14),
 ('server', 14),
 ('table', 13),
 ('waited', 13),
 ('back', 12),
 ('asked', 11),
 ('waitress', 11),
 ('go', 11),
 ('one', 11),
 ('dont', 11),
 ('min', 11)]

When inspecting the word cloud, we can see some negative words like bad, awful and slow. The 20 most occuring words show us no sign of negative reviews nor food poisoning.

In order to have a sharper view on bad reviews, we make a sentiment analysis over reviews.

#Now we add a column which has the sentiment values of the each review posted for Margaritaville, to see how the 
#author of this review feels about the restaurant.

#Create Text Blob Object and draw out the sentiment of each review by returning a value between -1.0 (extreme negative) 
#and 1.0 (extreme positive).
margaritaville_reviews['sentiment_analysis'] = margaritaville_reviews['description_filtered'].apply(lambda x: TextBlob(x).sentiment.polarity)
margaritaville_reviews = margaritaville_reviews.sort_values(by='sentiment_analysis', ascending=True)
margaritaville_reviews[:10]

C:\Users\KaabachiBayrem\.conda\envs\ada\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

	reviewRating	datePublished	description	author	alias	year_of_review	language	description_filtered	sentiment_analysis
449	1	2018-11-06	Had a horrible experience here. Waited a hour ...	Ryan D.	margaritaville-chicago-chicago	2018	en	horrible experience waited hour disgusting san...	-0.866667
432	1	2019-07-11	Don't go here. The hostess has a terrible att...	Joey B.	margaritaville-chicago-chicago	2019	en	dont go hostess terrible attitude walked walke...	-0.456250
440	1	2019-05-26	Horrible and slow service...waited over 15 min...	Nisha L.	margaritaville-chicago-chicago	2019	en	horrible slow servicewaited minutes waitress e...	-0.433333
435	1	2019-06-20	Absolutely horrible service. Must have caught ...	Jeff B.	margaritaville-chicago-chicago	2019	en	absolutely horrible service must caught shift ...	-0.313462
447	1	2018-12-27	The food was cold, the service was awful and w...	Carlos T.	margaritaville-chicago-chicago	2018	en	food cold service awful constantly ask water n...	-0.295000
423	1	2019-10-23	We waited for some time and the waitress came ...	Thomas L.	margaritaville-chicago-chicago	2019	en	waited time waitress came said would serve us ...	-0.275000
445	1	2019-01-21	Our waitress was awful. She had a passive aggr...	Katelynn B.	margaritaville-chicago-chicago	2019	en	waitress awful passive aggressive attitude who...	-0.237500
458	1	2018-07-07	Their hosts were completely confused when seat...	Alyssa M.	margaritaville-chicago-chicago	2018	en	hosts completely confused seating us called us...	-0.213333
453	1	2018-08-17	Overpriced garbage food! Bad service, long ser...	S D.	margaritaville-chicago-chicago	2018	en	overpriced garbage food bad service long servi...	-0.203750
451	1	2018-10-28	Nice waitress. Absolutely awful food: tasteles...	jenn H.	margaritaville-chicago-chicago	2018	en	nice waitress absolutely awful food tasteless ...	-0.180000

Let’s look at those reviews and detect those related to food poisoning

o Grabbed a takeout order tonight on a Friday, place was dead and every employee was a joke. Took 20 mins for 2 veggie tacos and employees were dying laughing about how they unplugged the GrubHub device and all the GrubHub orders we’re calling the phone to get refunds and they would hang up and cuss them out. Food was subpar. 3 people working on Friday night for a place that can probably seat 50 people. Unsurprisingly they had 4 customers. Shame on me for even trying the place.

o I almost passed out eating here when I found a bug INSIDE my steak taco … the chicken tacos were really dry and bland. The steak was good but omg the fact that something was crawling inside it.

o Bland. Customer service needs work. Offensive patrons. My chips and salsa was served in a ziplock bag . A generic one…

o Steak burrito ruined my day, stay away! Not the worst food poisoning I’ve ever had but pretty bad. Wouldn’t come back ever.

o Useless. This place used to be great. Now they don’t answer their phone, their message box is full, and their employees don’t know how to operate the credit card machine. Why bother…

o I literally live right across the street and I would rather get in my car and go get tacos or not eat them at all. Way over priced for what you get and they aren’t even good. I wouldn’t even pay 1 dollar for them let alone almost 4 dollars. Insane.

o As I am sitting here disgusted by what I jut consumed. The order of nachos without meat, intentionally I did not order meat to avoid the extra heavy feeling after eating. My order consisted of a massive order of chips, black beans, salsa, shredded cheese, a corner of sour cream & sliced avocado per my request & jalapeños. Flavorless is the biggest disappointment, beans no flavor, Salsa no flavor. I would not have enjoyed this if I was hung over. I came here per recommendation from a business down the street that was servicing my car. Never again. As people were walking in to order food, I wanted to ask, “what are you eating here & what does it taste like?”. I was actually headed to The Girl & Goat Diner, but came here out of convenience. I know next time.

Conclusion :

For further work we will construct our own dictionary of words related to food poisoning, that allow us easily pinpoint reviews that may show us signs of food poisoning in a restaurant.

Informed Plan

Food Chain investigation

Are food chains in different areas more prone to food inspections failure than the same food chains in other areas ?

To answer this question, we will analyze the geoographical distribution of risk, types of inspection, inpection result and violation type. We have shown that this is possible to do as we did it for the entire data and we want to do this now for the specific case of food chains in order to conclude whether these characteristics change depending on location within the same food chain.

Chicago Tribune article

Can we assess how real Chicago Tribune’s critics are ?

To answer this question, first we will seperate the establishments by risk category and then we will observe the frequency of inspections and how the inspection’s result impact the next inspection.Secondly, we will look at whether there has been a notable change in the inspection behavior past the article publication’s date. The article asks an important question “Does Chicago need more inspectors or more realisitc rules ?”. Therefore, we will try to answer this question by comparing the impact of the frequency of the previous investigation and the previous violation assigned on the next result. For the inspections’ frequency we can see by how much the probabilty of a complaint or food poisonning investigation changes with the time interval between inspections. For the “rules” criteria we can look at the most commun reported violations in passed inspections before a business fails and assess how critical their impact is.

Food chains

food[food["DBA Name"]=="subway"]["AKA Name"].unique()

array(['subway'], dtype=object)

food[food["AKA Name"]=="subway"]["DBA Name"].unique()

array(['subway', 'subway sandwiches and salads', 'subway store  # 25458',
       'subway sandwiches & salads', 'subway  5183', 'subway 14641',
       'the great state fare', 'c-k subway', 'subway 26199',
       'subway-sandwich', 'subway #44541', 'subway sandwich & salads',
       'sub way', 'sandwich & salad', 'subway sandwiches',
       'subway sandwich shop', 'subway restaurant',
       'subway sandwich & salad', 'subway # 26817-309 w. monroe',
       'subway / tcby', 'subway 7709', 'subway 1121', 'subway #25073',
       'subway 3634', 'subway store #30969', 'subway sandwich and salads',
       'subway 10266', 'skyhigh-101', 'subway sanwich shop',
       'subway #51581', 'subway  #38326', 'subway subs and salads',
       'subway sandwiches and salad', 'subway 50886',
       'subway sandwiches & salad', 'subway restaurant #35406',
       'subway 25064', 'lincoln ave subway', 'subway 275',
       'subway at norwegian american hospital', 'subway #4267',
       'subway sandwich', 'subway 24379', 'subway # 26771',
       'subway #65029', 'tng enterprises, inc', 'subway / jay shree inc',
       'suwbay', 'subway # 7659', 'wzumba inc', 'subway #24879'],
      dtype=object)

food[food["AKA Name"]=="mcdonalds"]["DBA Name"].unique()

array(['mcdonalds', 'mcdonalds restaurant', 'pma mcd inc', "mcdonald's",
       'mcdonalds restaurant # 14173', "hale family mcdonald's",
       'mcdonalds #335'], dtype=object)

food[food["DBA Name"]=="mcdonalds"]["AKA Name"].unique()

array(['mcdonalds', 'mc donalds', "mcdonald's", 'mcdonalds  '],
      dtype=object)

From these examples we can see that it is easier to identify food chains’ establishments by their AKA Name

# remove punctuation
food["DBA Name"]=food["DBA Name"].str.replace(r'[^\w\s]+', '')
food["AKA Name"]=food["AKA Name"].str.replace(r'[^\w\s]+', '')
# One of our goals is to investigate food chains, so we'll try to group them according to their AKA Name
# Hardcode formatting for food chains
food["AKA Name"]=food["AKA Name"].str.replace("kentucky fried chicken", 'kfc')
food["AKA Name"]=food["AKA Name"].str.replace("mc donalds","mcdonalds")
food["AKA Name"]=food["AKA Name"].str.replace("mcdonalds restaurant","mcdonalds")
food["AKA Name"]=food["AKA Name"].str.replace("dominos pizza","dominos")
food["AKA Name"]=food["AKA Name"].str.replace("subway sandwiches","subway")
food["AKA Name"]=food["AKA Name"].str.replace("chipotle mexican grill","chipotle")
food["AKA Name"]=food["AKA Name"].str.replace("potbelly sandwich works", "potbelly")
food["AKA Name"]=food["AKA Name"].str.replace("potbelly sandwich works llc", "potbelly")
food["AKA Name"]=food["AKA Name"].str.replace("whole foods market","whole foods")
food["AKA Name"]=food["AKA Name"].str.replace("7eleven","7 eleven")
food["AKA Name"]=food["AKA Name"].str.replace("j  j fish","jj fish  chicken")
food["AKA Name"]=food["AKA Name"].str.replace("quiznos sub","quiznos")
food["AKA Name"]=food["AKA Name"].str.replace("starbucks coffee","starbucks")
food["AKA Name"]=food["AKA Name"].str.replace("donutsbaskin","donuts baskin")


food["AKA Name"]=food["AKA Name"].str.title()
food["AKA Name"]=food["AKA Name"].str.replace(" ","")
food["DBA Name"]=food["DBA Name"].str.strip()

We can’t talk about food in Chicago without mentionning their deep dish pizza, and namely two of their most famous venues: Giordano’s and Lou Malnati’s . So let’s add them

food[["Malnati" in aka for aka in food["AKA Name"]]]["AKA Name"].unique()

array(['LouMalnatisPizzeria', 'LouMalnatis'], dtype=object)

food["AKA Name"]=food["AKA Name"].str.replace('LouMalnatisPizzeria', 'LouMalnatis')

giordanos=food[["Giordano" in aka for aka in food["AKA Name"]]].index
food.loc[giordanos,"AKA Name"]="Giordanos"

# We look at establishments with the same AKA name and different adresses
food_chains=food.groupby("AKA Name").Address.nunique().sort_values(ascending=False)
# We will retain food chains with 10 or more establishments
food_chains=food_chains[food_chains>10]
#food_chains[food_chains<10].head(20)
food_chains

AKA Name
Subway                       280
DunkinDonuts                 159
7Eleven                      110
Mcdonalds                    108
Starbucks                    101
Citgo                         72
Cvs                           69
DunkinDonutsBaskinRobbins     69
Walgreens                     42
JimmyJohns                    42
BurgerKing                    42
Marathon                      41
Bp                            34
DollarTree                    33
Chipotle                      33
Kfc                           31
GatewayNewstand               31
Dominos                       30
FamilyDollar                  29
Potbelly                      26
Wendys                        26
PizzaHut                      26
HaroldsChicken                26
TacoBell                      26
Freshii                       23
Wingstop                      22
AuBonPain                     21
Shell                         21
Giordanos                     21
JjFishChicken                 21
HaroldsChickenShack           20
Popeyes                       20
LaMichoacana                  18
SharksFishChicken             18
Mobil                         18
CornerBakeryCafe              18
JambaJuice                    16
PotbellyLlc                   15
DollarGeneral                 14
WholeFoods                    14
Herbalife                     14
Quiznos                       14
PapaJohnsPizza                14
Target                        13
SeeThruChineseKitchen         12
LouMalnatis                   12
PretAManger                   12
PeetsCoffeeTea                12
ArgoTea                       12
ProteinBar                    11
Pockets                       11
HalstedStreetDeli             11
Name: Address, dtype: int64

from sklearn.metrics.pairwise import euclidean_distances
# We are interested in food chains that are rather spread out across the city, for that we will assess the pairwise
# distances of establishments within the same food chain
def food_chain_spread(food_chain):
    location=food[food["AKA Name"]==food_chain][["Latitude","Longitude"]]
    dist_mat=euclidean_distances(location,location)
    return np.sum(dist_mat)/(len(location))
food_chain_spread("Subway")

436.22815124500346

spread=pd.Series(food_chains.index).apply(lambda x: food_chain_spread(x))

fc=pd.DataFrame({"Food_chain":food_chains.index,"number_of_establishments":food_chains.values,"spread":spread})

fc.sort_values("spread", ascending=False).head(10)

	Food_chain	number_of_establishments	spread
0	Subway	280	436.228151
3	Mcdonalds	108	282.205929
1	DunkinDonuts	159	156.132113
2	7Eleven	110	100.732759
4	Starbucks	101	99.936245
7	DunkinDonutsBaskinRobbins	69	84.836581
10	BurgerKing	42	50.560232
20	Wendys	26	45.364857
15	Kfc	31	44.239157
14	Chipotle	33	40.600311

Unsuprisingly, fast food chains dominate the list