# import libraries for data manipulation
import numpy as np
import pandas as pd

# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#remove warnings
import warnings
warnings.filterwarnings('ignore')


# Load the FoodHub order records from the CSV file in the working directory.
df = pd.read_csv('foodhub_order.csv')
# Preview the first five rows (5 is also head()'s default).
df.head(n=5)


# shape is a (number_of_rows, number_of_columns) tuple
df.shape

(1898, 9)


# use info() to print a concise summary of the DataFrame:
# column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   int64  
 1   customer_id            1898 non-null   int64  
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1898 non-null   object 
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 133.6+ KB


# Converting "object" columns to "category" reduces the memory needed to store the DataFrame.
# Convert 'restaurant_name', 'cuisine_type' and 'day_of_the_week' to categorical data.
for column in ("restaurant_name", "cuisine_type", "day_of_the_week"):
    df[column] = df[column].astype("category")
# Print the concise summary again to confirm the new dtypes and the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   order_id               1898 non-null   int64   
 1   customer_id            1898 non-null   int64   
 2   restaurant_name        1898 non-null   category
 3   cuisine_type           1898 non-null   category
 4   cost_of_the_order      1898 non-null   float64 
 5   day_of_the_week        1898 non-null   category
 6   rating                 1898 non-null   object  
 7   food_preparation_time  1898 non-null   int64   
 8   delivery_time          1898 non-null   int64   
dtypes: category(3), float64(1), int64(4), object(1)
memory usage: 102.7+ KB


# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()


# Distribution of the order cost.
# sns.distplot() is deprecated, so use histplot() with a KDE overlay instead;
# stat="density" keeps the y-axis on the density scale that distplot() used.
sns.histplot(df["cost_of_the_order"], kde=True, stat="density");


# Distribution of the food preparation time.
sns.histplot(df["food_preparation_time"], kde=True, stat="density");


# Distribution of the delivery time.
sns.histplot(df["delivery_time"], kde=True, stat="density");


# Replace every "Not given" rating with NaN so that pandas' missing-value
# tools (isna, dropna, ...) recognise it as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df["rating"] = df["rating"].replace('Not given', np.nan)
df["rating"]

0       NaN
1       NaN
2         5
3         3
4         4
       ... 
1893      5
1894      5
1895    NaN
1896      5
1897    NaN
Name: rating, Length: 1898, dtype: object


# The rating column is still of object dtype (strings plus NaN); cast it to
# float so it can be treated as a numeric variable with missing values.
df["rating"] = df["rating"].astype("float64")
df["rating"]

0       NaN
1       NaN
2       5.0
3       3.0
4       4.0
       ... 
1893    5.0
1894    5.0
1895    NaN
1896    5.0
1897    NaN
Name: rating, Length: 1898, dtype: float64


# number of orders without a rating (NaN after replacing 'Not given')
df["rating"].isna().sum()

736


# Horizontal bar chart of order counts for the ten restaurants with the most orders.
top_ten_restaurants = df["restaurant_name"].value_counts().head(10).index
sns.countplot(data=df, y="restaurant_name", order=top_ten_restaurants);


#### Observations:
#The top ten restaurants with the most orders include: 
#Shake Shack, the Meatball Shop, Blue Ribbon Sushi, Blue Ribbon Fried Chicken, Parm, RedFarm Broadway, 
#RedFarm Hudson, TAO, Han Dynasty, and Blue Ribbon Sushi Bar & Grill.


# Order counts per cuisine, with bars sorted from most to least ordered.
cuisines_by_popularity = df["cuisine_type"].value_counts().index
sns.countplot(data=df, y="cuisine_type", order=cuisines_by_popularity);


#Observation: The top 5 most ordered types of cuisine are: American, Japanese, Italian, Chinese, and Mexican.


# Histogram with KDE overlay of the order cost.
# sns.distplot() is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(df["cost_of_the_order"], kde=True, stat="density");


###Observation: 
#With a right-skewed distribution, a greater number of orders cost less than the mean of $16.50 
#than orders that cost more than the mean.


# Box plot of the order cost. The Series is passed explicitly as x= because the
# meaning of the first positional argument differs between seaborn versions.
sns.boxplot(x=df["cost_of_the_order"]);


# Number of orders placed on weekdays versus weekends.
sns.countplot(y="day_of_the_week", data=df);


# Exact order counts behind the plot above.
df["day_of_the_week"].value_counts()

Weekend    1351
Weekday     547
Name: day_of_the_week, dtype: int64


#Observation: 
#Most orders are placed on weekends.
#Of this sample, 1351 orders (or 71% of total orders) were placed on the weekend.


# Number of orders for each rating value (unrated orders are NaN and not drawn).
sns.countplot(data=df, y="rating");


# Exact counts per rating; value_counts() skips NaN by default.
df["rating"].value_counts()

5.0    588
4.0    386
3.0    188
Name: rating, dtype: int64


#Observation: Aside from the 736 orders with no rating, the remaining 61% of orders received ratings between 3 and 5 stars. 
#588 orders received 5 stars
#386 orders received 4 stars
#188 orders received 3 stars


# Histogram of the food preparation time.
sns.displot(df, x="food_preparation_time");


#Observation:
#Most orders take between 20-21 minutes, 25-26 minutes, 30-31 minutes, and 33-34 minutes to prepare.


# Histogram with KDE overlay of the delivery time.
# sns.distplot() is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(df["delivery_time"], kde=True, stat="density");


#Observation: 
#The delivery time has a high variability and is slightly left-skewed, indicating that there are slightly more orders
#that take more time than average (~24 minutes) to arrive than orders that take less time than 24 minutes to arrive.


# The five restaurants with the most orders, as a bar chart and as exact counts.
top_five_counts = df["restaurant_name"].value_counts().head(5)
sns.countplot(data=df, y="restaurant_name", order=top_five_counts.index);


df["restaurant_name"].value_counts().head(5)

Shake Shack                  219
The Meatball Shop            132
Blue Ribbon Sushi            119
Blue Ribbon Fried Chicken     96
Parm                          68
Name: restaurant_name, dtype: int64


# Order counts for every (cuisine, weekday/weekend) combination, largest first.
df.value_counts(subset=["cuisine_type", "day_of_the_week"])

cuisine_type    day_of_the_week
American        Weekend            415
Japanese        Weekend            335
Italian         Weekend            207
American        Weekday            169
Chinese         Weekend            163
Japanese        Weekday            135
Italian         Weekday             91
Mexican         Weekend             53
Chinese         Weekday             52
Indian          Weekend             49
Middle Eastern  Weekend             32
Mediterranean   Weekend             32
Mexican         Weekday             24
Indian          Weekday             24
Middle Eastern  Weekday             17
Thai            Weekend             15
Mediterranean   Weekday             14
French          Weekend             13
Korean          Weekend             11
Southern        Weekend             11
Spanish         Weekend             11
Southern        Weekday              6
French          Weekday              5
Thai            Weekday              4
Vietnamese      Weekend              4
                Weekday              3
Korean          Weekday              2
Spanish         Weekday              1
dtype: int64


# Number of orders that cost more than 20 dollars.
(df["cost_of_the_order"] > 20).sum()

555


# Boolean flag per order: True when the order cost more than 20 dollars.
twenty = df["cost_of_the_order"].gt(20)
# normalize=True returns the share of True/False instead of raw counts.
twenty.value_counts(normalize=True)

False    0.707587
True     0.292413
Name: cost_of_the_order, dtype: float64


# mean delivery time over all orders in the dataset
df["delivery_time"].mean()

24.161749209694417


# Highest amount spent on a single order.
max_cost = df["cost_of_the_order"].max()
# Keep every order whose cost equals that maximum (more than one row if there are ties).
is_max_cost = df["cost_of_the_order"] == max_cost
df1 = df.loc[is_max_cost]
# Display the full order details, including the customer_id.
df1


# Build the frame used for the bivariate plots:
#  - drop rows with missing values (orders without a rating),
#  - drop customer_id and order_id, which are identifiers and numerically meaningless.
df1 = (
    df.dropna()
    .reset_index(drop=True)
    .drop(columns=["customer_id", "order_id"])
)

# Plot the pairwise relationships between the numeric variables, coloured by rating.
sns.pairplot(df1, hue="rating");


#Observation: None of the bivariate pairplot analysis (with ratings noted in color) shows significant linear patterns between:
#- delivery time & cost of the order, delivery time & food prep time, delivery time & rating
#- cost of the order & food prep time, cost of the order & rating
#- food prep time & rating


# Correlation heatmap of the numeric variables.
# Restrict to numeric columns first: since pandas 2.0, DataFrame.corr() no longer
# silently skips non-numeric columns and raises on the category columns in df1.
sns.heatmap(df1.select_dtypes(include="number").corr(), annot=True);


###Observations: 
#There are no significant correlations between the following numerical variables: cost of the order, 
#rating, food preparation time, and delivery time.
#However, cost of the order has a slightly positive correlation with both rating and food prep time, indicating:
#as cost of the order increases, rating may increase slightly. 
#And as the time to prepare food increases, the cost of the order increases slightly.
#Cost of the order has a slightly negative correlation with delivery time, indicating:
#As cost of the order increases, the delivery time decreases slightly.


# Order cost compared between weekdays and weekends.
sns.boxplot(data=df, x="day_of_the_week", y="cost_of_the_order");


####Observation:
#The range of the cost of orders is slightly lower on weekends. The middle 50% of orders are 
#almost exactly the same cost on weekends and weekdays. Both weekend and weekday order costs tend
#to skew above their medians (both around $14), with maximums between $30 and $36.


# Delivery time compared between weekdays and weekends.
sns.boxplot(data=df, x="day_of_the_week", y="delivery_time");


#The time to deliver orders tends to be higher on weekdays than on weekends.
#This is shown by the higher minimum, maximum, and interquartile box position for weekday delivery times,
#in comparison to the same measures for weekend delivery times.

#The range and IQR of weekend delivery times are wider than those of weekday delivery times, indicating that
#weekend delivery times may have a higher variance than weekday delivery times.


# Food preparation time compared between weekdays and weekends.
sns.boxplot(data=df, x="day_of_the_week", y="food_preparation_time");


# Box plot of order costs for each cuisine (one horizontal box per cuisine).
sns.boxplot(data=df, x="cost_of_the_order", y="cuisine_type");


#Observations:
#Korean and Vietnamese food orders tend to be lower in price than other cuisines, 
#with a few outliers for both types of foods' order cost.


# Grouped bar plot: for each of the five restaurants with the most orders,
# the number of orders per rating value (one bar per rating).
top_five_restaurants = df["restaurant_name"].value_counts().head(5).index
sns.countplot(data=df, y="restaurant_name", hue="rating", order=top_five_restaurants);


#Observations:
#Of the Top 5 restaurants with the largest number of orders, Shake Shack has the highest number of 4 and 5 star ratings.
#The Meatball Shop also has over 50 5-star ratings.
#Blue Ribbon Sushi and Blue Ribbon Fried Chicken have a mix of 3, 4, and 5 star reviews, although total reviews are
#fewer than Shake Shack and the Meatball Shop.
#Parm has fewer total number of reviews than the other four Top 5 restaurants, and has a slightly greater number of
#4 star than 5 star reviews.
#Based on this analysis, a restaurant's popularity seems to be strongly correlated with the number of reviews they have received.
#However, without 1 and 2 star ratings and the missing ratings data, we cannot test this correlation nor assume that
#a restaurant's number of reviews causes more frequent FoodHub orders.


# Per restaurant: number of ratings received and their mean.
# The columns form a two-level index: ("rating", "count") and ("rating", "mean").
rating_summary = df.groupby("restaurant_name").agg({"rating": ["count", "mean"]})
# Restaurants without any rating have a NaN mean; drop them.
df_valid1 = rating_summary.dropna()
df_valid1


# Check that no missing values remain after the drop.
df_valid1.isna().sum()

rating  count    0
        mean     0
dtype: int64


# Keep restaurants with more than 50 ratings AND a mean rating above 4.0.
has_enough_ratings = df_valid1[("rating", "count")] > 50
is_highly_rated = df_valid1[("rating", "mean")] > 4.0
df_valid2 = df_valid1.loc[has_enough_ratings & is_highly_rated]
df_valid2


# Orders costing more than 20 dollars; FoodHub charges 25% on these
# (hence the "_25" suffix in the names below).
costs_over_20 = df["cost_of_the_order"] > 20
df_25 = df.loc[costs_over_20]
df_25


# Total value of those orders, and FoodHub's 25% share of it.
total_cost_25 = df_25["cost_of_the_order"].sum()
revenue_25 = 0.25 * total_cost_25
revenue_25

3688.7275


# Orders costing more than 5 and at most 20 dollars; FoodHub charges 15% on these.
order_cost = df["cost_of_the_order"]
df_5 = df.loc[(order_cost > 5) & (order_cost <= 20)]
df_5


# Total value of those orders, and FoodHub's 15% share of it.
total_cost_5 = df_5["cost_of_the_order"].sum()
revenue_5 = 0.15 * total_cost_5
revenue_5

2477.5755000000004


# Net revenue: the 15% share on orders above 5 and up to 20 dollars
# plus the 25% share on orders above 20 dollars.
total_rev = revenue_5 + revenue_25
total_rev

6166.303


# Total time for an order = food preparation time + delivery time.
df["total_time"] = df["delivery_time"] + df["food_preparation_time"]
df


# Orders whose total time is greater than 60.
df_time = df[df["total_time"] > 60]
df_time


# Percentage of all orders with a total time above 60.
# Computed from the data instead of the previously hard-coded 200 / 1898,
# so the figure stays correct if the dataset changes.
len(df_time) / len(df) * 100

10.537407797681771


# Orders placed on the weekend only.
is_weekend = df["day_of_the_week"] == "Weekend"
df_weekend = df.loc[is_weekend]
# Mean delivery time of the weekend orders.
df_weekend["delivery_time"].mean()

22.4700222057735


# Orders placed on weekdays only.
is_weekday = df["day_of_the_week"] == "Weekday"
df_weekday = df.loc[is_weekday]
# Mean delivery time of the weekday orders.
df_weekday["delivery_time"].mean()

28.340036563071298

	order_id	customer_id	cost_of_the_order	food_preparation_time	delivery_time
count	1.898000e+03	1898.000000	1898.000000	1898.000000	1898.000000
mean	1.477496e+06	171168.478398	16.498851	27.371970	24.161749
std	5.480497e+02	113698.139743	7.483812	4.632481	4.972637
min	1.476547e+06	1311.000000	4.470000	20.000000	15.000000
25%	1.477021e+06	77787.750000	12.080000	23.000000	20.000000
50%	1.477496e+06	128600.000000	14.140000	27.000000	25.000000
75%	1.477970e+06	270525.000000	22.297500	31.000000	28.000000
max	1.478444e+06	405334.000000	35.410000	35.000000	33.000000

	rating
	count	mean
restaurant_name
'wichcraft	1	5.000000
12 Chairs	2	4.500000
5 Napkin Burger	2	4.000000
67 Burger	1	5.000000
Amma	2	4.500000
...	...	...
Zero Otto Nove	1	4.000000
brgr	1	3.000000
da Umberto	1	5.000000
ilili Restaurant	13	4.153846
indikitch	2	4.500000

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time
0	1477147	337525	Hangawi	Korean	30.75	Weekend	Not given	25	20
1	1477685	358141	Blue Ribbon Sushi Izakaya	Japanese	12.08	Weekend	Not given	25	23
2	1477070	66393	Cafe Habana	Mexican	12.23	Weekday	5	23	28
3	1477334	106968	Blue Ribbon Fried Chicken	American	29.20	Weekend	3	25	15
4	1478249	76942	Dirty Bird to Go	American	11.59	Weekday	4	25	24

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time	total_time
7	1477859	89574	Barbounia	Mediterranean	5.97	Weekday	3.0	33	30	63
10	1477895	143926	Big Wong Restaurant _¤¾Ñ¼	Chinese	5.92	Weekday	NaN	34	28	62
19	1477354	67487	Blue Ribbon Sushi	Japanese	16.20	Weekend	4.0	35	26	61
24	1476714	363783	Cafe Mogador	Middle Eastern	15.86	Weekday	NaN	32	29	61
54	1477760	130507	Jack's Wife Freda	Mediterranean	22.75	Weekend	3.0	35	29	64
...	...	...	...	...	...	...	...	...	...	...
1869	1476923	50199	J. G. Melon	American	19.40	Weekday	4.0	35	26	61
1873	1478148	261371	Shake Shack	American	22.31	Weekend	NaN	35	28	63
1875	1478039	292343	Amy Ruth's	Southern	12.23	Weekday	NaN	32	33	65
1880	1477466	222734	Shake Shack	American	13.97	Weekend	5.0	35	27	62
1889	1478190	94152	RedFarm Broadway	Chinese	8.68	Weekday	3.0	33	30	63

Project Python Foundations: FoodHub Data Analysis

Context

Objective

Data Description

Data Dictionary

Let us start by importing the required libraries

Understanding the structure of the data

Observations:

Question 1: Write the code to check the shape of the dataset and write your observations based on that. (0.5 mark)

Observations:

Question 2: Write the observations based on the below output from the info() method. (0.5 mark)

Observations:

Question 3: 'restaurant_name', 'cuisine_type', 'day_of_the_week' are object types. Write the code to convert the mentioned features to 'category' and write your observations on the same. (0.5 mark)

Observations:

Question 4: Write the code to find the summary statistics and write your observations based on that. (1 mark)

Observations:

Question 5: How many orders are not rated? (0.5 mark)

Observations:

Exploratory Data Analysis (EDA)

Univariate Analysis

Question 6: Explore all the variables and provide observations on the distributions of all the relevant variables in the dataset. (5 marks)

Question 7: Write the code to find the top 5 restaurants that have received the highest number of orders. (1 mark)

Observations:

Question 8: Write the code to find the most popular cuisine on weekends. (1 mark)

Observations:

Question 9: Write the code to find the number of total orders where the cost is above 20 dollars. What is the percentage of such orders in the dataset? (1 mark)

Observations:

Question 10: Write the code to find the mean delivery time based on this dataset. (1 mark)

Observations:

Question 11: Suppose the company has decided to give a free coupon of 15 dollars to the customer who has spent the maximum amount on a single order. Write the code to find the ID of the customer along with the order details. (1 mark)

Observations:

Multivariate Analysis

Question 12: Perform bivariate/multivariate analysis to explore relationships between the important variables in the dataset. (7 marks)

Observations:

Question 14: Suppose the company charges the restaurant 25% on the orders having cost greater than 20 dollars and 15% on the orders having cost greater than 5 dollars. Write the code to find the net revenue generated on all the orders given in the dataset. (2 marks)

Observations:

Question 15: Suppose the company wants to analyze the total time required to deliver the food. Write the code to find out the percentage of orders that have more than 60 minutes of total delivery time. (2 marks)

Observations:

Question 16: Suppose the company wants to analyze the delivery time of the orders on weekdays and weekends. Write the code to find the mean delivery time on weekdays and weekends. Write your observations on the results. (2 marks)

Observations:

Conclusion and Recommendations

Question 17: Write the conclusions and business recommendations derived from the analysis. (3 marks)

Key Insights: