# Data Preprocessing with Python
# Install required packages
!pip install kaggle
!pip install sqlalchemy
!pip install pyodbc
# Download dataset from Kaggle
import kaggle
# https://www.kaggle.com/datasets/ankitbansal06/retail-orders
!kaggle datasets download ankitbansal06/retail-orders
# Unzip the downloaded file
import zipfile
with zipfile.ZipFile('retail-orders.zip', 'r') as zip_ref:
zip_ref.extractall('retail-orders')
# Read the data
import pandas as pd

# fix: read from the relative folder the archive was extracted to above,
# not a machine-specific absolute D:\ path that breaks on any other machine.
# The dataset encodes missing values as 'Not Available' / 'unknown',
# so map those strings to NaN on load.
df = pd.read_csv('retail-orders/orders.csv',
                 na_values=['Not Available', 'unknown'])

# Quick sanity checks on the loaded data
df.head(20)               # eyeball the first rows
df['Ship Mode'].unique()  # shipping-mode categories (NaN now present after na_values)
df.isna().sum()           # missing-value count per column
# Normalize column names: lowercase, spaces replaced with underscores,
# so the frame matches the df_orders SQL schema.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Derived pricing columns for analysis:
#   discount   = list_price * discount_percent / 100
#   sale_price = list_price - discount
#   profit     = sale_price - cost_price
df['discount'] = df['list_price'] * df['discount_percent'] * .01
df['sale_price'] = df['list_price'] - df['discount']
df['profit'] = df['sale_price'] - df['cost_price']
# Parse the YYYY-MM-DD order_date strings into real datetimes.
df['order_date'] = pd.to_datetime(df['order_date'], format='%Y-%m-%d')

# The raw pricing inputs are superseded by the derived
# discount / sale_price / profit columns, so remove them in place.
raw_price_cols = ['list_price', 'cost_price', 'discount_percent']
df.drop(columns=raw_price_cols, inplace=True)
# Connect to SQL Server and load data
import sqlalchemy as sal

# Make the pyodbc DBAPI explicit in the URL (mssql:// resolves to pyodbc
# anyway, but mssql+pyodbc:// states the intent). Windows trusted
# connection to the local server 'Foxy_Saravana', database 'master'.
engine = sal.create_engine(
    'mssql+pyodbc://Foxy_Saravana/master?driver=ODBC+DRIVER+17+FOR+SQL+SERVER'
)

# Load the data into SQL Server.
# fix: engine.begin() opens a transactional connection that is committed
# and closed automatically. The original engine.connect() was never closed
# and, under SQLAlchemy 2.x, the inserted rows were never committed.
# if_exists='append' keeps the column types defined by the CREATE TABLE DDL.
with engine.begin() as conn:
    df.to_sql('df_orders', con=conn, index=False, if_exists='append')
-- SQL Analysis Queries
-- Schema for the cleaned orders data appended from pandas.
-- Column names match the lowercased/underscored pandas frame.
CREATE TABLE df_orders (
    order_id     INT PRIMARY KEY,
    order_date   DATE,
    ship_mode    VARCHAR(20),
    segment      VARCHAR(20),
    country      VARCHAR(20),
    city         VARCHAR(20),
    state        VARCHAR(20),
    postal_code  VARCHAR(20),
    region       VARCHAR(20),
    category     VARCHAR(20),
    sub_category VARCHAR(20),
    product_id   VARCHAR(20),
    quantity     INT,
    -- monetary columns derived in pandas
    discount     DECIMAL(7, 2),
    sale_price   DECIMAL(7, 2),
    profit       DECIMAL(7, 2)
);
-- Top 10 products by total revenue.
SELECT TOP 10
       product_id,
       SUM(sale_price) AS sales
FROM df_orders
GROUP BY product_id
ORDER BY SUM(sale_price) DESC;
-- Top 5 highest-selling products within each region.
-- The window function runs over the grouped rows, so the rank can be
-- computed in the same CTE as the aggregate.
WITH region_sales AS (
    SELECT region,
           product_id,
           SUM(sale_price) AS sales,
           ROW_NUMBER() OVER (
               PARTITION BY region
               ORDER BY SUM(sale_price) DESC
           ) AS rn
    FROM df_orders
    GROUP BY region, product_id
)
SELECT region, product_id, sales, rn
FROM region_sales
WHERE rn <= 5;
-- Month-over-month sales comparison: 2022 vs 2023, one row per calendar month.
WITH monthly_sales AS (
    SELECT YEAR(order_date)  AS order_year,
           MONTH(order_date) AS order_month,
           SUM(sale_price)   AS sales
    FROM df_orders
    GROUP BY YEAR(order_date), MONTH(order_date)
)
SELECT order_month,
       -- pivot the two years into side-by-side columns
       SUM(CASE order_year WHEN 2022 THEN sales ELSE 0 END) AS sales_2022,
       SUM(CASE order_year WHEN 2023 THEN sales ELSE 0 END) AS sales_2023
FROM monthly_sales
GROUP BY order_month
ORDER BY order_month;
-- For each category, the month (yyyyMM) with the highest sales.
WITH category_month AS (
    SELECT category,
           FORMAT(order_date, 'yyyyMM') AS order_ymonth,
           SUM(sale_price) AS sales
    FROM df_orders
    GROUP BY category, FORMAT(order_date, 'yyyyMM')
),
ranked AS (
    SELECT category, order_ymonth, sales,
           ROW_NUMBER() OVER (
               PARTITION BY category
               ORDER BY sales DESC
           ) AS rn
    FROM category_month
)
-- rn = 1 keeps the single best month per category
SELECT category, order_ymonth, sales, rn
FROM ranked
WHERE rn = 1;
-- Which sub-category had the highest growth BY PROFIT in 2023 compared to 2022.
-- fix: the original aggregated sale_price even though the question asks about
-- profit; this version aggregates the profit column and names the outputs
-- accordingly. NULLIF guards against division by zero when 2022 profit is 0
-- (such rows yield NULL growth and sort last under DESC in SQL Server).
WITH yearly_profit AS (
    SELECT sub_category,
           YEAR(order_date) AS order_year,
           SUM(profit) AS profit
    FROM df_orders
    GROUP BY sub_category, YEAR(order_date)
),
pivoted AS (
    SELECT sub_category
         , SUM(CASE WHEN order_year = 2022 THEN profit ELSE 0 END) AS profit_2022
         , SUM(CASE WHEN order_year = 2023 THEN profit ELSE 0 END) AS profit_2023
    FROM yearly_profit
    GROUP BY sub_category
)
SELECT TOP 1 *
     , (profit_2023 - profit_2022) * 100 / NULLIF(profit_2022, 0) AS growth_percent
FROM pivoted
ORDER BY (profit_2023 - profit_2022) * 100 / NULLIF(profit_2022, 0) DESC;