๐Ÿ” Sales Performance

from dotenv import load_dotenv
import os
import ibis
from ibis import _
ibis.options.interactive = True

# Load environment variables
load_dotenv()

# Retrieve credentials
user = os.getenv("DB_USER_P")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST")
database = os.getenv("DB_NAME_P")

# Create the connection string
connection_string = f"{user}://{user}:{password}@{host}:5432/{database}"

# Connect to the database
con = ibis.connect(connection_string)

# Access a table
t = con.table("sales_orders")
t
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ account_num โ”ƒ account_name                      โ”ƒ sku      โ”ƒ category โ”ƒ quantity โ”ƒ unit_price โ”ƒ ext_price โ”ƒ date                โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ int64       โ”‚ string                            โ”‚ string   โ”‚ string   โ”‚ int64    โ”‚ float64    โ”‚ float64   โ”‚ timestamp(6)        โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚      803666 โ”‚ Fritsch-Glover                    โ”‚ HX-24728 โ”‚ Hat      โ”‚        1 โ”‚      98.98 โ”‚     98.98 โ”‚ 2014-09-28 11:56:02 โ”‚
โ”‚       64898 โ”‚ O'Conner Inc                      โ”‚ LK-02338 โ”‚ Sweater  โ”‚        9 โ”‚      34.80 โ”‚    313.20 โ”‚ 2014-04-24 16:51:22 โ”‚
โ”‚      423621 โ”‚ Beatty and Sons                   โ”‚ ZC-07383 โ”‚ Sweater  โ”‚       12 โ”‚      60.24 โ”‚    722.88 โ”‚ 2014-09-17 17:26:22 โ”‚
โ”‚      137865 โ”‚ Gleason, Bogisich and Franecki    โ”‚ QS-76400 โ”‚ Sweater  โ”‚        5 โ”‚      15.25 โ”‚     76.25 โ”‚ 2014-01-30 07:34:02 โ”‚
โ”‚      435433 โ”‚ Morissette-Heathcote              โ”‚ RU-25060 โ”‚ Sweater  โ”‚       19 โ”‚      51.83 โ”‚    984.77 โ”‚ 2014-08-24 06:18:12 โ”‚
โ”‚      198887 โ”‚ Shanahan-Bartoletti               โ”‚ FT-50146 โ”‚ Sweater  โ”‚        4 โ”‚      18.51 โ”‚     74.04 โ”‚ 2014-09-05 07:24:23 โ”‚
โ”‚      969663 โ”‚ Gusikowski, Reichert and Gerlach  โ”‚ AE-95093 โ”‚ Socks    โ”‚        4 โ”‚      49.95 โ”‚    199.80 โ”‚ 2014-04-28 21:51:24 โ”‚
โ”‚        1288 โ”‚ Wilderman, Herman and Breitenberg โ”‚ FT-50146 โ”‚ Sweater  โ”‚       14 โ”‚      68.20 โ”‚    954.80 โ”‚ 2013-12-04 13:53:26 โ”‚
โ”‚      979589 โ”‚ Brown Inc                         โ”‚ HX-24728 โ”‚ Hat      โ”‚       16 โ”‚      52.99 โ”‚    847.84 โ”‚ 2014-02-07 14:53:59 โ”‚
โ”‚      839884 โ”‚ Turcotte, Turner and Anderson     โ”‚ FT-50146 โ”‚ Sweater  โ”‚        8 โ”‚      21.35 โ”‚    170.80 โ”‚ 2014-09-03 16:06:44 โ”‚
โ”‚           โ€ฆ โ”‚ โ€ฆ                                 โ”‚ โ€ฆ        โ”‚ โ€ฆ        โ”‚        โ€ฆ โ”‚          โ€ฆ โ”‚         โ€ฆ โ”‚ โ€ฆ                   โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
  1. Which categories generate the most revenue?

    • Group by category, sum ext_price.
  2. What are the top-selling SKUs by quantity and revenue?

    • Aggregate quantity and ext_price per sku.
  3. Which accounts contribute the most to total sales?

    • Sum ext_price grouped by account_name or account_num.

๐Ÿ‘ค Customer Behavior

  1. Which customers purchase the highest quantity of items?

    • Sum quantity grouped by account_name.
  2. What is the average unit price customers are paying?

    • Average of unit_price, possibly by account_name or category.
  3. Are there customers who only buy from one category?

    • Count unique categories per account_name.

๐Ÿ’ฐ Product Pricing

  1. What is the price distribution of items in each category?

    • Use descriptive stats (min, max, mean, std) on unit_price by category.
  2. Are there categories with high unit prices but low sales volume?

    • Compare unit_price with total quantity per category.
  3. Do higher-priced products sell less frequently?

    • Correlate unit_price with quantity.

๐Ÿงพ Order Metrics

  1. What is the average size of an order (in quantity and price)?

    • Average quantity and ext_price.
  2. How many unique SKUs are sold per order/account?

    • Count distinct sku per account_num.
import polars as pl
# Move date to be first column
(t
 .relocate('date')
 )
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ date                โ”ƒ account_num โ”ƒ account_name                      โ”ƒ sku      โ”ƒ category โ”ƒ quantity โ”ƒ unit_price โ”ƒ ext_price โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ timestamp(6)        โ”‚ int64       โ”‚ string                            โ”‚ string   โ”‚ string   โ”‚ int64    โ”‚ float64    โ”‚ float64   โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚ 2014-09-28 11:56:02 โ”‚      803666 โ”‚ Fritsch-Glover                    โ”‚ HX-24728 โ”‚ Hat      โ”‚        1 โ”‚      98.98 โ”‚     98.98 โ”‚
โ”‚ 2014-04-24 16:51:22 โ”‚       64898 โ”‚ O'Conner Inc                      โ”‚ LK-02338 โ”‚ Sweater  โ”‚        9 โ”‚      34.80 โ”‚    313.20 โ”‚
โ”‚ 2014-09-17 17:26:22 โ”‚      423621 โ”‚ Beatty and Sons                   โ”‚ ZC-07383 โ”‚ Sweater  โ”‚       12 โ”‚      60.24 โ”‚    722.88 โ”‚
โ”‚ 2014-01-30 07:34:02 โ”‚      137865 โ”‚ Gleason, Bogisich and Franecki    โ”‚ QS-76400 โ”‚ Sweater  โ”‚        5 โ”‚      15.25 โ”‚     76.25 โ”‚
โ”‚ 2014-08-24 06:18:12 โ”‚      435433 โ”‚ Morissette-Heathcote              โ”‚ RU-25060 โ”‚ Sweater  โ”‚       19 โ”‚      51.83 โ”‚    984.77 โ”‚
โ”‚ 2014-09-05 07:24:23 โ”‚      198887 โ”‚ Shanahan-Bartoletti               โ”‚ FT-50146 โ”‚ Sweater  โ”‚        4 โ”‚      18.51 โ”‚     74.04 โ”‚
โ”‚ 2014-04-28 21:51:24 โ”‚      969663 โ”‚ Gusikowski, Reichert and Gerlach  โ”‚ AE-95093 โ”‚ Socks    โ”‚        4 โ”‚      49.95 โ”‚    199.80 โ”‚
โ”‚ 2013-12-04 13:53:26 โ”‚        1288 โ”‚ Wilderman, Herman and Breitenberg โ”‚ FT-50146 โ”‚ Sweater  โ”‚       14 โ”‚      68.20 โ”‚    954.80 โ”‚
โ”‚ 2014-02-07 14:53:59 โ”‚      979589 โ”‚ Brown Inc                         โ”‚ HX-24728 โ”‚ Hat      โ”‚       16 โ”‚      52.99 โ”‚    847.84 โ”‚
โ”‚ 2014-09-03 16:06:44 โ”‚      839884 โ”‚ Turcotte, Turner and Anderson     โ”‚ FT-50146 โ”‚ Sweater  โ”‚        8 โ”‚      21.35 โ”‚    170.80 โ”‚
โ”‚ โ€ฆ                   โ”‚           โ€ฆ โ”‚ โ€ฆ                                 โ”‚ โ€ฆ        โ”‚ โ€ฆ        โ”‚        โ€ฆ โ”‚          โ€ฆ โ”‚         โ€ฆ โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
## Q1
# underscore _ is shorthand for accessing columns. ibis accesses functions.

(t
 .aggregate(by='category', ext_price=_.ext_price.sum())
 .order_by(ibis.desc('ext_price'))
#  .order_by(lambda t: t.ext_price.desc())
 )
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ category โ”ƒ ext_price โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ string   โ”‚ float64   โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚ Sweater  โ”‚ 301716.00 โ”‚
โ”‚ Socks    โ”‚ 169169.93 โ”‚
โ”‚ Hat      โ”‚  99294.01 โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
#Q1 (idiomatic way)
(t
 .group_by('category')
 .agg(ext_price=_.ext_price.sum())
 .order_by(_.ext_price.desc())
 )
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ category โ”ƒ ext_price โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ string   โ”‚ float64   โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚ Sweater  โ”‚ 301716.00 โ”‚
โ”‚ Socks    โ”‚ 169169.93 โ”‚
โ”‚ Hat      โ”‚  99294.01 โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
(t
 .filter(t.date.year() == 2014)
 .mutate(month=t.date.strftime('%b-%Y'))
 .select('quantity','month')
 .pivot_wider(names_from='month', values_from='quantity', names_sort=True, values_agg='sum')
)
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ Apr-2014 โ”ƒ Aug-2014 โ”ƒ Feb-2014 โ”ƒ Jan-2014 โ”ƒ Jul-2014 โ”ƒ Jun-2014 โ”ƒ Mar-2014 โ”ƒ May-2014 โ”ƒ Sep-2014 โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ int64    โ”‚ int64    โ”‚ int64    โ”‚ int64    โ”‚ int64    โ”‚ int64    โ”‚ int64    โ”‚ int64    โ”‚ int64    โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚      887 โ”‚     1065 โ”‚      935 โ”‚      833 โ”‚     1006 โ”‚      684 โ”‚      791 โ”‚      787 โ”‚      911 โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜


month_list = (t
 .filter(t.date.year() == 2014)
 .mutate(month_num=_.date.month(),
         month=_.date.strftime('%b-%Y'))
 .select('month','month_num')
 .order_by('month_num')
 .distinct(on=['month', 'month_num'])
 .to_polars()
 ['month'].to_list()
 )
month_list
['Jan-2014',
 'Feb-2014',
 'Mar-2014',
 'Apr-2014',
 'May-2014',
 'Jun-2014',
 'Jul-2014',
 'Aug-2014',
 'Sep-2014']
# Now use the list of months
(t
 .filter(t.date.year() == 2014)
 .mutate(month=t.date.strftime('%b-%Y'))
 .select('quantity','month')
 .pivot_wider(names_from='month',values_from='quantity', values_agg='sum')
 .relocate(*[month_list])
)
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ Jan-2014 โ”ƒ Feb-2014 โ”ƒ Mar-2014 โ”ƒ Apr-2014 โ”ƒ May-2014 โ”ƒ Jun-2014 โ”ƒ Jul-2014 โ”ƒ Aug-2014 โ”ƒ Sep-2014 โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ int64    โ”‚ int64    โ”‚ int64    โ”‚ int64    โ”‚ int64    โ”‚ int64    โ”‚ int64    โ”‚ int64    โ”‚ int64    โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚      833 โ”‚      935 โ”‚      791 โ”‚      887 โ”‚      787 โ”‚      684 โ”‚     1006 โ”‚     1065 โ”‚      911 โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
## Q2
(t
 .group_by('sku')
 .agg(quantity=_.quantity.sum(),
      ext_price=_.ext_price.sum())
 )
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ sku      โ”ƒ quantity โ”ƒ ext_price โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ string   โ”‚ int64    โ”‚ float64   โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚ ZC-07383 โ”‚     1144 โ”‚  61483.76 โ”‚
โ”‚ AE-95093 โ”‚     1062 โ”‚  57146.97 โ”‚
โ”‚ XX-25746 โ”‚     1133 โ”‚  59384.57 โ”‚
โ”‚ FT-50146 โ”‚      995 โ”‚  53558.69 โ”‚
โ”‚ RU-25060 โ”‚     1167 โ”‚  62957.70 โ”‚
โ”‚ IC-59308 โ”‚     1240 โ”‚  67312.52 โ”‚
โ”‚ GS-95639 โ”‚      823 โ”‚  44710.44 โ”‚
โ”‚ QS-76400 โ”‚     1209 โ”‚  69386.07 โ”‚
โ”‚ LK-02338 โ”‚     1036 โ”‚  54329.78 โ”‚
โ”‚ HX-24728 โ”‚      756 โ”‚  39909.44 โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
# Show top 5 customers that contribute the largest percentage to revenue

(t
 .group_by('account_name')
 .agg(ext_price=_.ext_price.sum())
 .mutate(percent=(_.ext_price / _.ext_price.sum()) * 100)
 .order_by(_.percent.desc())
 .head(5)
#  .limit(5)
 )
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ account_name             โ”ƒ ext_price โ”ƒ percent  โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ string                   โ”‚ float64   โ”‚ float64  โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚ D'Amore PLC              โ”‚   3529.41 โ”‚ 0.618999 โ”‚
โ”‚ Wilderman Group          โ”‚   3466.92 โ”‚ 0.608040 โ”‚
โ”‚ Hilll, Schultz and Braun โ”‚   3390.29 โ”‚ 0.594600 โ”‚
โ”‚ Kuvalis-Roberts          โ”‚   3296.00 โ”‚ 0.578063 โ”‚
โ”‚ Mills Inc                โ”‚   2852.69 โ”‚ 0.500314 โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
t.ext_price.sum()

โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
โ”‚ 570179.94 โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
(t
 .nunique(where=t.account_name == "D'Amore PLC")
 )

โ”Œโ”€โ”€โ”€โ”
โ”‚ 4 โ”‚
โ””โ”€โ”€โ”€โ”˜
t.get_backend()
<ibis.backends.postgres.Backend at 0x1225ee210>
# Which year had the most revenue and what was the value?
(t
 .mutate(year=_.date.year())
 .group_by('year')
 .agg(total_sales=_.ext_price.sum())
 .filter(_.total_sales == _.total_sales.max())
 )
โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ year  โ”ƒ total_sales โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ int32 โ”‚ float64     โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚  2014 โ”‚    425348.0 โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
## Q2
(t
 .group_by('sku')
 .agg(quantity=_.quantity.sum(),
      ext_price=_.ext_price.sum())
 )
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ sku      โ”ƒ quantity โ”ƒ ext_price โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ string   โ”‚ int64    โ”‚ float64   โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚ ZC-07383 โ”‚     1144 โ”‚  61483.76 โ”‚
โ”‚ AE-95093 โ”‚     1062 โ”‚  57146.97 โ”‚
โ”‚ XX-25746 โ”‚     1133 โ”‚  59384.57 โ”‚
โ”‚ FT-50146 โ”‚      995 โ”‚  53558.69 โ”‚
โ”‚ RU-25060 โ”‚     1167 โ”‚  62957.70 โ”‚
โ”‚ IC-59308 โ”‚     1240 โ”‚  67312.52 โ”‚
โ”‚ GS-95639 โ”‚      823 โ”‚  44710.44 โ”‚
โ”‚ QS-76400 โ”‚     1209 โ”‚  69386.07 โ”‚
โ”‚ LK-02338 โ”‚     1036 โ”‚  54329.78 โ”‚
โ”‚ HX-24728 โ”‚      756 โ”‚  39909.44 โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
(t
 .group_by('sku')
 .agg(quantity=_.quantity.sum(),
      ext_price=_.ext_price.sum())
 )
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ sku      โ”ƒ quantity โ”ƒ ext_price โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ string   โ”‚ int64    โ”‚ float64   โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚ ZC-07383 โ”‚     1144 โ”‚  61483.76 โ”‚
โ”‚ AE-95093 โ”‚     1062 โ”‚  57146.97 โ”‚
โ”‚ XX-25746 โ”‚     1133 โ”‚  59384.57 โ”‚
โ”‚ FT-50146 โ”‚      995 โ”‚  53558.69 โ”‚
โ”‚ RU-25060 โ”‚     1167 โ”‚  62957.70 โ”‚
โ”‚ IC-59308 โ”‚     1240 โ”‚  67312.52 โ”‚
โ”‚ GS-95639 โ”‚      823 โ”‚  44710.44 โ”‚
โ”‚ QS-76400 โ”‚     1209 โ”‚  69386.07 โ”‚
โ”‚ LK-02338 โ”‚     1036 โ”‚  54329.78 โ”‚
โ”‚ HX-24728 โ”‚      756 โ”‚  39909.44 โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
import altair as alt

bar_chart = (alt
 .Chart(t.group_by('sku').agg(total_quantity=_.quantity.sum()))
 .mark_bar(color='#556B2F')
 .encode(x=alt.X('sku', axis=alt.Axis(labelAngle=0, labelFontSize=14), title=None,
                 sort=alt.EncodingSortField(field='total_quantity', order='descending')),
         y=alt.Y('total_quantity', axis=alt.Axis(labelFontSize=14), title=None),
         tooltip=['sku','total_quantity'])
 .properties(width=1000, height=500, 
             title={'text':'Total quantity by product SKU', 'fontSize': 20})
#  .configure_view(fill='#FFE4B5')
 .configure(background='#FFE4B5')
 .interactive()
 )
bar_chart
# What is the total, average, and median sales value for each day of the week. Sort values by average.
(t
 .mutate(day=_.date.day_of_week.full_name())
 .group_by('day')
 .agg(total_sales=_.ext_price.sum(),
      avg_sales=_.ext_price.mean(),
      median_sales=_.ext_price.median(),)
 .order_by(_.avg_sales.desc())
 )
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ day       โ”ƒ total_sales โ”ƒ avg_sales  โ”ƒ median_sales โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ string    โ”‚ float64     โ”‚ float64    โ”‚ float64      โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚ Sunday    โ”‚    91575.29 โ”‚ 627.228014 โ”‚      560.825 โ”‚
โ”‚ Saturday  โ”‚    81985.61 โ”‚ 611.832910 โ”‚      512.260 โ”‚
โ”‚ Monday    โ”‚    88708.66 โ”‚ 591.391067 โ”‚      411.620 โ”‚
โ”‚ Tuesday   โ”‚    85035.91 โ”‚ 578.475578 โ”‚      470.890 โ”‚
โ”‚ Wednesday โ”‚    83444.28 โ”‚ 563.812703 โ”‚      454.200 โ”‚
โ”‚ Friday    โ”‚    69183.76 โ”‚ 528.120305 โ”‚      456.040 โ”‚
โ”‚ Thursday  โ”‚    70246.43 โ”‚ 487.822431 โ”‚      395.145 โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
# What are the top three hours of the day with the least sales.

(t
 .mutate(hour=_.date.hour())
 .group_by('hour')
 .agg(total_sales=_.ext_price.sum())
 .order_by(_.total_sales.asc())
 .limit(3)
 )
โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ hour  โ”ƒ total_sales โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ int32 โ”‚ float64     โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚    16 โ”‚    13706.50 โ”‚
โ”‚    22 โ”‚    16160.71 โ”‚
โ”‚    21 โ”‚    16329.77 โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
(t
 .select('category')
 .distinct()
 )
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ category โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ string   โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚ Socks    โ”‚
โ”‚ Sweater  โ”‚
โ”‚ Hat      โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
# Customers who bought all clothing products (loyal customers)

(t
 .group_by('account_name')
 .aggregate(category_num=t.category.nunique())
 .filter(_.category_num == 3)
)
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ account_name                 โ”ƒ category_num โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ string                       โ”‚ int64        โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚ Bashirian, Beier and Watsica โ”‚            3 โ”‚
โ”‚ Beier-Bosco                  โ”‚            3 โ”‚
โ”‚ Fritsch-Glover               โ”‚            3 โ”‚
โ”‚ Halvorson PLC                โ”‚            3 โ”‚
โ”‚ Herman Ltd                   โ”‚            3 โ”‚
โ”‚ Koepp-McLaughlin             โ”‚            3 โ”‚
โ”‚ Kuvalis-Roberts              โ”‚            3 โ”‚
โ”‚ Ledner-Kling                 โ”‚            3 โ”‚
โ”‚ Mills Inc                    โ”‚            3 โ”‚
โ”‚ Schultz Group                โ”‚            3 โ”‚
โ”‚ โ€ฆ                            โ”‚            โ€ฆ โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
# If we didn't know beforehand how many categories we had
(t
 .group_by('account_name')
 .aggregate(category_num=t.category.nunique())
 .filter(_.category_num == t.select(t.category).distinct().count())
)
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ account_name                 โ”ƒ category_num โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ string                       โ”‚ int64        โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚ Bashirian, Beier and Watsica โ”‚            3 โ”‚
โ”‚ Beier-Bosco                  โ”‚            3 โ”‚
โ”‚ Fritsch-Glover               โ”‚            3 โ”‚
โ”‚ Halvorson PLC                โ”‚            3 โ”‚
โ”‚ Herman Ltd                   โ”‚            3 โ”‚
โ”‚ Koepp-McLaughlin             โ”‚            3 โ”‚
โ”‚ Kuvalis-Roberts              โ”‚            3 โ”‚
โ”‚ Ledner-Kling                 โ”‚            3 โ”‚
โ”‚ Mills Inc                    โ”‚            3 โ”‚
โ”‚ Schultz Group                โ”‚            3 โ”‚
โ”‚ โ€ฆ                            โ”‚            โ€ฆ โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
# Count the actual number of loyal customers

(t
 .group_by('account_name')
 .aggregate(category_num=t.category.nunique())
 .filter(_.category_num == t.select(t.category).distinct().count())
 .count()
)

โ”Œโ”€โ”€โ”€โ”€โ”
โ”‚ 11 โ”‚
โ””โ”€โ”€โ”€โ”€โ”˜
(t
 .filter(t.date.year() == 2014)
 .mutate(month=t.date.strftime('%b-%Y'))
 .select('quantity','month')
 .pivot_wider(names_from='month',values_from='quantity', values_agg='sum')
 .relocate(*[month_list])
)
(t
#  .filter(t.date.year() == 2014)
 .mutate(year=_.date.year().cast('string'),
         date=_.date.date().truncate('M'))
 .select('date','quantity','year')
 .pivot_wider(id_cols='date', names_from='year', values_from='quantity', values_agg='sum')
#  .relocate(*[month_list])
)
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“
โ”ƒ date       โ”ƒ 2013  โ”ƒ 2014  โ”ƒ
โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ
โ”‚ date       โ”‚ int64 โ”‚ int64 โ”‚
โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
โ”‚ 2014-02-01 โ”‚  NULL โ”‚   935 โ”‚
โ”‚ 2014-06-01 โ”‚  NULL โ”‚   684 โ”‚
โ”‚ 2013-12-01 โ”‚  1078 โ”‚  NULL โ”‚
โ”‚ 2013-11-01 โ”‚   825 โ”‚  NULL โ”‚
โ”‚ 2014-08-01 โ”‚  NULL โ”‚  1065 โ”‚
โ”‚ 2014-04-01 โ”‚  NULL โ”‚   887 โ”‚
โ”‚ 2014-07-01 โ”‚  NULL โ”‚  1006 โ”‚
โ”‚ 2013-09-01 โ”‚     6 โ”‚  NULL โ”‚
โ”‚ 2014-05-01 โ”‚  NULL โ”‚   787 โ”‚
โ”‚ 2014-03-01 โ”‚  NULL โ”‚   791 โ”‚
โ”‚ โ€ฆ          โ”‚     โ€ฆ โ”‚     โ€ฆ โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
(t
 .rename('snake_case') #change cols to snake case
 .mutate(hour=lambda t: t.date.hour(),
         week_day=lambda t: t.date.day_of_week.full_name(),
         day_abbrev=t.date.day_of_week.full_name().substr(0, 3),
         year=t.date.year(),
         month=t.date.month(),
         minute=t.date.minute(),
         second=t.date.second(),
         )
#  .filter(lambda t: t.second == 2)
#  .execute()
 )
(t
 .to_polars()
 .group_by('account_name')
 .agg(category_num=pl.col('category').unique().count())
 .filter(pl.col('category_num') == 3)
 .unique('account_name')
#  .select('account_name','category')
#  .head(4)
#  .write_clipboard()
#  .with_columns(pl.col('date').dt.weekday())
#  .group_by('account_name')
#  .agg(pl.sum('ext_price'))
#  .with_columns(percent=pl.col('ext_price') / pl.col('ext_price').sum())
#  .sort('percent', descending=True)
 )
shape: (11, 2)
account_name category_num
str u32
"Beier-Bosco" 3
"Koepp-McLaughlin" 3
"Herman Ltd" 3
"Halvorson PLC" 3
"Mills Inc" 3
โ€ฆ โ€ฆ
"Schultz Group" 3
"Upton, Runolfsson and O'Reilly" 3
"Fritsch-Glover" 3
"Kuvalis-Roberts" 3
"Bashirian, Beier and Watsica" 3

Connecting to the database

import mysql.connector
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Retrieve database credentials from environment variables
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST")
database = os.getenv("DB_NAME_IBIS")

# Connect to the database
cnx = mysql.connector.connect(
    user=user,
    password=password,
    host=host,
    database=database
)
import polars as pl

df = pl.read_ndjson('how_to_get_rich.json')
(df
 .with_columns(pl.from_epoch(pl.col("time_parsed"), time_unit="s"))
)

Reading data from database

import polars as pl

actor = pl.read_database(
    query="SELECT * FROM actor",
    connection=cnx,
)  
actor
# Read all the tables

tables_df = pl.read_clipboard()
tables_df
import polars as pl

# List of table names
tables = tables_df['Tables_in_sakila'].to_list()

# Dictionary to store DataFrames
dataframes = {table: pl.read_database(query=f"SELECT * FROM {table}", connection=cnx) for table in tables}

# Manually unpack the dictionary into variables if needed
city, address = dataframes["city"], dataframes["address"]
city
# Unpacking dynamically into variables
globals().update(dataframes)
actor

Query warm up

Select first and last name from customer where last name is Ziegler.

SELECT first_name, last_name
FROM customer
WHERE last_name = 'ZIEGLER';
(customer
 .select('first_name','last_name')
 .filter(pl.col('last_name') == 'ZIEGLER')
 )

Display the category table

SELECT *
FROM category;
category

Subsetting columns - select name from the language table.

SELECT name
FROM language;
(language
 .select('name')
 )

Using expressions, literal and built-in function - Creating columns not present in the table.

SELECT language_id,
    'COMMON' language_usage,
    language_id * 3.1415927 lang_pi_value,
    upper(name) language_name
FROM language;
(language
 .with_columns(language_usage=pl.lit('COMMON'),
               lang_pi_value=pl.col('language_id') * 3.1415927,
               language_name=pl.col('name').str.to_uppercase()
               )
 .select('language_id','language_usage','lang_pi_value','language_name')
 )

Column aliases - the previous query can also be written as follows.

SELECT language_id,
    'COMMON' AS language_usage,
    language_id * 3.1415927 AS lang_pi_value,
    upper(name) AS language_name
FROM language;

Removing duplicates - Show ids of all actors who appeared in a film.

SELECT actor_id
FROM film_actor
ORDER BY actor_id;

Use DISTINCT to show unique ids.

SELECT DISTINCT actor_id
FROM film_actor
ORDER BY actor_id;
# with duplicates
(film_actor
 .select('actor_id')
 .sort('actor_id')
 )
# without duplicates
(film_actor
 .select('actor_id')
 .unique()
 .sort('actor_id')
 )

Subqueries - Create single column to show first and last name for customers named Jessie

SELECT concat(cust.last_name, ', ', cust.first_name) full_name
FROM
    (SELECT first_name, last_name, email
    FROM customer
    WHERE first_name = 'JESSIE'
    ) cust;
(customer
 .with_columns(pl.concat_str(['first_name','last_name'], separator=', ').alias('full_name'))
 .filter(pl.col('first_name') == 'JESSIE')
 .select('full_name')
 )

Show names and time of people who rented movies on 2005-06-14

SELECT customer.first_name, customer.last_name,
    time(rental.rental_date) rental_time
FROM customer
    INNER JOIN rental
    ON customer.customer_id = rental.customer_id
WHERE date(rental.rental_date) = '2005-06-14';
(customer
 .join(rental, on='customer_id', how='inner')
 .with_columns(Date=pl.col('rental_date').dt.date(),
               rental_time=pl.col('rental_date').dt.time())
 .filter(pl.col('Date') == pl.date(2005,6,14))
 .select('first_name', 'last_name', 'rental_time')
 )