Customer Shopping Analysis
Introduction
This is my personal project, use dataset from Kaggle.com to practice Python skills, data visualization and business knowledge.
Link Kaggle: https://www.kaggle.com/code/trmnguynhi/customer-shopping-analysis
Business case
The dataset offers a comprehensive view of consumer shopping trends, aiming to uncover patterns and behaviors in retail purchasing. It contains detailed transactional data across various product categories, customer demographics, and purchase channels. Key features may include:
- Transaction Details: Purchase date, transaction value, product category, and payment method.
- Customer Information: Age group, gender, location, and loyalty status.
- Shopping Behavior: Frequency of purchases, average spend per transaction, and seasonal trends.
This analysis will contain three steps as below:
- Data Preparation
- Data Exploration
- Data Analysis & Insights
Data Preparation
import pandas as pd import matplotlib.pyplot as plt import numpy as np import plotly.express as px import seaborn as snsshopping_data = pd.read_csv("/kaggle/input/customer-shopping-latest-trends-dataset/shopping_trends.csv")
Data Exploration
shopping_data.info()
shopping_data.describe()

shopping_data.head(10)

Data Analysis
I. Customer Segment
1. Geographic
customer_by_location = shopping_data.groupby("Location").agg({"Customer ID":"count"}).reset_index()state_name_to_code = { 'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA' , 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA' , 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA' , 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD' , 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO' , 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM' , 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR' , 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN' , 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV' , 'Wisconsin': 'WI', 'Wyoming': 'WY' } customer_by_location['Location code']= customer_by_location['Location'].map(state_name_to_code)
fig = px.choropleth(customer_by_location, locationmode='USA-states',locations= 'Location code'
, color = "Customer ID"
, color_continuous_scale= 'viridis_r'
, hover_data= {'Location': True, 'Location code': False}
, labels={'Location':'State name','Customer ID': 'Number of Customers'}
, scope='usa'
, title = 'Number of Customers by Location'
)
fig.show()

puramt_by_location = shopping_data.groupby("Location").agg({"Purchase Amount (USD)":"mean"}).reset_index()
puramt_by_location['Location code']= puramt_by_location['Location'].map(state_name_to_code)
px.choropleth(puramt_by_location, locationmode='USA-states',locations= 'Location code'
, color = "Purchase Amount (USD)"
, color_continuous_scale= 'viridis_r'
, hover_data= {'Location': True, 'Location code': False}
, labels={'Location':'State name','Purchase Amount (USD)': 'Total Purchase Amt (USD)'}
, scope='usa'
, title = 'Purchase Amt by Location'
)

From the map, the number of customers distributes more densely in the West of USA (California and Montana). However, regarding the total purchase amount, Alaska, Arizona and Pennsylvania accounts for the highest amount.
2. Demographic
2.1 Gender
data_by_gender = shopping_data.groupby("Gender").agg(Customer_Count=("Customer ID", "count")
, Total_Purchase_Amount=("Purchase Amount (USD)", "sum")
, Average_Purchase_Amount=("Purchase Amount (USD)", "mean")).reset_index()
data_by_gender

fig, ax = plt.subplots(1,2)
colors = ['#ff69b4','#1f77b4']
ax[0].pie(data_by_gender['Customer_Count'], labels= data_by_gender['Gender'], colors = colors, autopct='%.0f%%')
ax[0].set_title('Customer Distribution by Gender', size =10)
ax[1].pie(data_by_gender['Total_Purchase_Amount'], labels= data_by_gender['Gender'], colors = colors, autopct='%.0f%%')
ax[1].set_title('Purchase Amount by Gender', size =10)

The number of male customers is double that of female customers, but their average purchase amount is nearly the same.
frequency_of_purchases = shopping_data.groupby(['Gender','Frequency of Purchases']).agg(Count = ('Frequency of Purchases','count')).reset_index()
frequency_of_purchases_female = frequency_of_purchases[frequency_of_purchases['Gender']=='Female']
frequency_of_purchases_male = frequency_of_purchases[frequency_of_purchases['Gender']=='Male']
fig,ax=plt.subplots(1,2)
ax[0].pie(frequency_of_purchases_female['Count'],labels= frequency_of_purchases_female['Frequency of Purchases']
, autopct='%.0f%%', textprops={'fontsize': 7})
ax[0].set_title("Female", size =10)
ax[1].pie(frequency_of_purchases_male['Count'],labels= frequency_of_purchases_male['Frequency of Purchases']
, autopct='%.0f%%', textprops={'fontsize': 7})
ax[1].set_title("Male", size =10)
fig.suptitle('Purchase Frequency by Gender', fontsize=10)
plt.subplots_adjust(wspace=0.8, top=1.2)

The frequency of purchase distribution is nearly the same between each groups, and between male and female.
color_by_gender = shopping_data.groupby(['Gender','Color']).agg(Count=('Color','count')).reset_index()
color_by_gender_female = color_by_gender[color_by_gender['Gender']=='Female']
color_by_gender_male = color_by_gender[color_by_gender['Gender']=='Male']
color_by_gender_female = color_by_gender_female.sort_values(by='Count', ascending= False).head(5)
color_by_gender_male = color_by_gender_male.sort_values(by='Count', ascending= False).head(5)
palette1 = color_by_gender_female['Color'].tolist()
palette2 = color_by_gender_male['Color'].tolist()
fig, ax = plt.subplots(1,2, figsize = (7,3))
sns.barplot(color_by_gender_female, y= 'Color', x= 'Count', palette= palette1,ax= ax[0])
ax[0].set_title('Female')
sns.barplot(color_by_gender_male, y= 'Color', x= 'Count', palette= palette2,ax= ax[1])
ax[1].set_title('Male')
plt.subplots_adjust(wspace=0.5, top =0.8)
plt.suptitle('Top 5 Most Favorite Color by Gender')

Women tend to like vibrant colors, while men prefer more muted colors.
2.2 Age:
I divide Age into 6 groups for analysis
shopping_data['Age_Group']=['18 to 20' if i<=20 else
'21 to 30' if i<=30 else
'31 to 40' if i <=40 else
'41 to 50' if i <=50 else
'51 to 60' if i <=60 else
'61 to 70' for i in shopping_data['Age']]
age_distribution = shopping_data.groupby('Age_Group').agg(NumberofCustomer = ('Age_Group','count'),Avg_Purchase_Amount=("Purchase Amount (USD)", "mean")).reset_index()
fig, ax = plt.subplots()
sns.barplot(age_distribution, x = 'Age_Group', y= 'NumberofCustomer', ax= ax)
ax2 = ax.twinx()
sns.lineplot(age_distribution, x = 'Age_Group', y= 'Avg_Purchase_Amount',color = 'black', ax= ax2 )
ax.set_xlabel('Age Group')
ax.set_ylabel('Number of Customers')
ax2.set_ylabel('Average Purchase Amount (USD)')

Age Group from 21 to 30, which the a group that has the most stable income and is in comfortable finance
payment_method_by_age = shopping_data.groupby(['Age_Group','Payment Method']).agg(Count = ('Payment Method','count')).reset_index()
sns.barplot(payment_method_by_age, x = 'Age_Group', y= 'Count', hue= 'Payment Method')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

Cash is most used for Age Group below 20 and above 51, who are less have connection with technology than others. Most of group prefer using Credit Card than Debit Card.
shipping_type_by_age = shopping_data.groupby(['Age_Group','Shipping Type']).agg(Count = ('Customer ID','count')).reset_index()
# shipping_type_by_age
fig,ax = plt.subplots(3,2, figsize=(20, 14))
ax = ax.flatten()
for i, age_group in enumerate(shipping_type_by_age['Age_Group'].unique()):
age_data = shipping_type_by_age[shipping_type_by_age['Age_Group'] == age_group]
ax[i].pie(age_data['Count'], labels=age_data['Shipping Type'], autopct='%.0f%%', textprops= {'fontsize': 8})
ax[i].set_title(f'Age Group: {age_group}')
plt.tight_layout()

II. Product Categories
Color & Season
color_by_season = shopping_data.groupby(['Season','Color']).agg(TotalCustomer = ('Customer ID', 'count') ).reset_index()
top_colors = color_by_season.groupby('Season').apply(lambda x: x.nlargest(3, 'TotalCustomer')).reset_index(drop=True)
top_colors

prefered_size = shopping_data['Size'].value_counts().reset_index()
prefered_size.columns = ['Size', 'Count']
sns.barplot(prefered_size, x= 'Size', y = 'Count')

Based on favourite colors of each season, we can develop strategies to make products with those colors to boost sales. Based on prefered size, company can arrange the product line to avoid out of stock
Review Rating & Categories
rating_by_cat = shopping_data.groupby(['Category']).agg(AvgRating= ('Review Rating','mean')).reset_index()

Promo Used and Discount Applied
promo_used = shopping_data['Promo Code Used'].value_counts().reset_index()
plt.pie(promo_used['count'], labels= promo_used['Promo Code Used'], autopct= '%.0f%%')
plt.title('Promo Used Distribution')

discount_applied = shopping_data['Discount Applied'].value_counts().reset_index()
plt.pie(discount_applied['count'], labels= discount_applied['Discount Applied'], autopct= '%.0f%%')
plt.title('Discount Applied Distribution')
