In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Peek at the start of the raw CSV through the shell before parsing it with pandas.
# NOTE(review): relies on a `head` executable being available to the shell.
!head sales_data.csv

In [None]:
# Load the sales records into a DataFrame, asking pandas to parse the
# `Date` column as dates instead of leaving it as plain strings.
sales = pd.read_csv('sales_data.csv', parse_dates=['Date'])

In [None]:
# Preview the first rows of the freshly loaded frame.
sales.head()

In [None]:
# Dimensions of the dataset as (rows, columns).
sales.shape

In [None]:
# Per-column overview of the frame (names, dtypes, non-null counts).
sales.info()

In [None]:
# Summary statistics for the frame, using describe()'s default column selection.
sales.describe()

### Numerical analysis and visualization

We'll analyze the `Unit_Cost` column:

In [None]:
# Summary statistics for the `Unit_Cost` column alone.
sales['Unit_Cost'].describe()

In [None]:
# Arithmetic mean of `Unit_Cost`.
sales['Unit_Cost'].mean()

In [None]:
# Median of `Unit_Cost`, for comparison with the mean computed in the previous cell.
sales['Unit_Cost'].median()

In [None]:
# Horizontal box plot of `Unit_Cost` (vert=False lays the box on its side).
sales['Unit_Cost'].plot.box(vert=False, figsize=(14, 6))

In [None]:
# Kernel density estimate (KDE) of `Unit_Cost`.
sales['Unit_Cost'].plot.density(figsize=(14, 6))

In [None]:
# KDE of `Unit_Cost` with the mean and median overlaid as vertical lines.
ax = sales['Unit_Cost'].plot(kind='density', figsize=(14,6)) # kde
# Label each marker so the legend tells the reader which line is which;
# without labels the red/green lines are indistinguishable in meaning.
ax.axvline(sales['Unit_Cost'].mean(), color='red', label='mean')
ax.axvline(sales['Unit_Cost'].median(), color='green', label='median')
# Trailing semicolon hides the Legend object's repr from the cell output.
ax.legend();

In [None]:
# Histogram of `Unit_Cost`: x axis is the cost, y axis counts the sales per bin.
ax = sales['Unit_Cost'].plot.hist(figsize=(14, 6))
ax.set_ylabel('Number of Sales')
ax.set_xlabel('dollars')

### Categorical analysis and visualization

We'll analyze the `Age_Group` column:

In [None]:
# Re-display the first rows before looking at the categorical `Age_Group` column.
sales.head()

In [None]:
# Number of records in each `Age_Group` category.
sales['Age_Group'].value_counts()

In [None]:
# Share of records per age group, drawn as a pie chart.
sales['Age_Group'].value_counts().plot.pie(figsize=(6, 6))

In [None]:
# Same per-age-group counts as a bar chart, with the y axis labelled.
ax = sales['Age_Group'].value_counts().plot.bar(figsize=(14, 6))
ax.set_ylabel('Number of Sales')

### Relationship between the columns?

Can we find any significant relationship?

In [None]:
# Pairwise correlation matrix; numeric_only=True restricts it to numeric columns.
corr = sales.corr(numeric_only=True)

# Display the matrix as the cell output.
corr

In [None]:
# Heat map of the correlation matrix `corr`.
fig = plt.figure(figsize=(8,8))
# Correlations live in [-1, 1]; pinning vmin/vmax to that range centres the
# diverging 'RdBu' colormap on 0, so the neutral colour means "no correlation"
# and the two hues map to the two signs instead of to the data's min/max.
plt.matshow(corr, cmap='RdBu', vmin=-1, vmax=1, fignum=fig.number)
# Label both axes with the column names; x labels are rotated to avoid overlap.
plt.xticks(range(len(corr.columns)), corr.columns, rotation='vertical');
plt.yticks(range(len(corr.columns)), corr.columns);
cb = plt.colorbar()
cb.ax.tick_params(labelsize=10)

In [None]:
# Scatter plot of revenue against customer age.
sales.plot.scatter(x='Customer_Age', y='Revenue', figsize=(6, 6))

In [None]:
# Scatter plot of profit against revenue.
sales.plot.scatter(x='Revenue', y='Profit', figsize=(6, 6))

In [None]:
# Box plot of `Profit`, one box per `Age_Group` category.
ax = sales.boxplot(column='Profit', by='Age_Group', figsize=(10, 6))
ax.set_ylabel('Profit')

In [None]:
# Columns to draw, one box plot each, on a 2x3 grid of subplots.
boxplot_cols = [
    'Year',
    'Customer_Age',
    'Order_Quantity',
    'Unit_Cost',
    'Unit_Price',
    'Profit',
]

sales[boxplot_cols].plot.box(subplots=True, layout=(2, 3), figsize=(14, 8))

### Column wrangling

We can also create new columns or modify existing ones.

#### Add and calculate a new `Revenue_per_Age` column

Use this formula

$$ Revenue\_per\_Age = Revenue / Customer\_Age $$

#### Add and calculate a new `Calculated_Cost` column

Use this formula

$$ Calculated\_Cost = Order\_Quantity * Unit\_Cost $$

See the relationship between `Cost` and `Profit` using a scatter plot.

#### Add and calculate a new `Calculated_Revenue` column

Use this formula

$$ Calculated\_Revenue = Cost + Profit $$

#### Modify all `Unit_Price` values adding 3% tax to them

### Selection & Indexing:

### Get all the sales made in the state of `Kentucky`

### Get the mean revenue of the `Adults (35-64)` sales group

### How many records belong to Age Group `Youth (<25)` or `Adults (35-64)`?

### Get the mean revenue of the sales group `Adults (35-64)` in `United States`

### Increase the revenue by 10% to every sale made in France