The company owns a chain of stores across Russia that sell a variety of alcoholic drinks. The company recently ran a wine promotion in Saint Petersburg that was very successful. Due to the cost to the business, it isn’t possible to run the promotion in all regions. The marketing team would like to target 10 other regions that have similar buying habits to Saint Petersburg where they would expect the promotion to be similarly successful.
The marketing team has sourced historical sales volumes per capita for several different drink types.
This notebook includes an interactive data exploration where you can easily compare alcohol consumption patterns between different regions of Russia. To address the business problem, the Dynamic Time Warping (DTW) technique was used to calculate the distance between time series of the same drink from different regions. Then, Principal Component Analysis (PCA) was used to project the distances between all regions onto a plane, where it is easy to visualize the regions most similar to Saint Petersburg in terms of alcohol consumption pattern.
First, let's load the data and take a look at some of its rows:
import pandas as pd
data = pd.read_csv("russian_alcohol_consumption.csv")
data.head(10)
| | year | region | wine | beer | vodka | champagne | brandy |
|---|---|---|---|---|---|---|---|
| 0 | 1998 | Republic of Adygea | 1.9 | 8.8 | 3.4 | 0.3 | 0.1 |
| 1 | 1998 | Altai Krai | 3.3 | 19.2 | 11.3 | 1.1 | 0.1 |
| 2 | 1998 | Amur Oblast | 2.1 | 21.2 | 17.3 | 0.7 | 0.4 |
| 3 | 1998 | Arkhangelsk Oblast | 4.3 | 10.6 | 11.7 | 0.4 | 0.3 |
| 4 | 1998 | Astrakhan Oblast | 2.9 | 18.0 | 9.5 | 0.8 | 0.2 |
| 5 | 1998 | Republic of Bashkortostan | 1.8 | 17.5 | 10.7 | 0.9 | 0.2 |
| 6 | 1998 | Belgorod Oblast | 3.4 | 23.0 | 10.8 | 0.9 | 0.1 |
| 7 | 1998 | Bryansk Oblast | 3.4 | 32.4 | 9.7 | 0.5 | 0.1 |
| 8 | 1998 | Republic of Buryatia | 1.1 | 8.8 | 15.8 | 0.9 | 0.1 |
| 9 | 1998 | Vladimir Oblast | 1.5 | 16.6 | 16.8 | 0.5 | 0.1 |
As we can see, the data appears to be well behaved. Let's take a look at the total number of rows:
data.shape
(1615, 7)
There are 1615 rows in total in the dataset. We can now take a look at the range of values in the "year" column:
print(data.year.unique())
print(len(data.year.unique()))
[1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016] 19
Everything checks out: 1998 to 2016, totalling 19 values. But how many federal subjects are included in the "region" column?
print(data.region.unique())
print(len(data.region.unique()))
['Republic of Adygea' 'Altai Krai' 'Amur Oblast' 'Arkhangelsk Oblast' 'Astrakhan Oblast' 'Republic of Bashkortostan' 'Belgorod Oblast' 'Bryansk Oblast' 'Republic of Buryatia' 'Vladimir Oblast' 'Volgograd Oblast' 'Vologda Oblast' 'Voronezh Oblast' 'Republic of Dagestan' 'Jewish Autonomous Oblast' 'Zabaykalsky Krai' 'Ivanovo Oblast' 'Republic of Ingushetia' 'Irkutsk Oblast' 'Kabardino-Balkar Republic' 'Kaliningrad Oblast' 'Republic of Kalmykia' 'Kaluga Oblast' 'Kamchatka Krai' 'Karachay-Cherkess Republic' 'Republic of Karelia' 'Kemerovo Oblast' 'Kirov Oblast' 'Kostroma Oblast' 'Krasnodar Krai' 'Krasnoyarsk Krai' 'Republic of Crimea' 'Kurgan Oblast' 'Kursk Oblast' 'Leningrad Oblast' 'Lipetsk Oblast' 'Magadan Oblast' 'Mari El Republic' 'Republic of Mordovia' 'Moscow' 'Moscow Oblast' 'Murmansk Oblast' 'Nenets Autonomous Okrug' 'Nizhny Novgorod Oblast' 'Novgorod Oblast' 'Novosibirsk Oblast' 'Omsk Oblast' 'Orenburg Oblast' 'Oryol Oblast' 'Penza Oblast' 'Perm Krai' 'Primorsky Krai' 'Pskov Oblast' 'Altai Republic' 'Komi Republic' 'Tuva Republic' 'Rostov Oblast' 'Ryazan Oblast' 'Samara Oblast' 'Saint Petersburg' 'Saratov Oblast' 'Sakhalin Oblast' 'Sverdlovsk Oblast' 'Sevastopol' 'Republic of North Ossetia-Alania' 'Smolensk Oblast' 'Stavropol Krai' 'Tambov Oblast' 'Republic of Tatarstan' 'Tver Oblast' 'Tomsk Oblast' 'Tula Oblast' 'Tyumen Oblast' 'Udmurt Republic' 'Ulyanovsk Oblast' 'Khabarovsk Krai' 'Republic of Khakassia' 'Khanty–Mansi Autonomous Okrug – Yugra' 'Chelyabinsk Oblast' 'Chechen Republic' 'Chuvash Republic' 'Chukotka Autonomous Okrug' 'Sakha (Yakutia) Republic' 'Yamalo-Nenets Autonomous Okrug' 'Yaroslavl Oblast'] 85
85 federal subjects in total; that's a considerable number! As for the other columns, let's make sure the dtypes are what we expect (float64 for the drink columns):
data.dtypes
year           int64
region        object
wine         float64
beer         float64
vodka        float64
champagne    float64
brandy       float64
dtype: object
Everything is fine! But what about missing values?
data.isna().sum()
year          0
region        0
wine         63
beer         58
vodka        61
champagne    63
brandy       66
dtype: int64
Hm... there are some missing values in the time series. But which of the regions have missing values, and how many are there in each?
for region in data["region"].unique():
temp = data[data["region"] == region]
if temp.isna().sum().sum() > 0:
print("\n" + "-"*30)
print(f"For region {region}:")
print(temp.isna().sum())
------------------------------
For region Republic of Ingushetia:
year          0
region        0
wine         12
beer         10
vodka        10
champagne    12
brandy       15
dtype: int64

------------------------------
For region Republic of Crimea:
year          0
region        0
wine         16
beer         16
vodka        16
champagne    16
brandy       16
dtype: int64

------------------------------
For region Sevastopol:
year          0
region        0
wine         16
beer         16
vodka        16
champagne    16
brandy       16
dtype: int64

------------------------------
For region Chechen Republic:
year          0
region        0
wine         19
beer         16
vodka        19
champagne    19
brandy       19
dtype: int64
Given that each time series has only 19 time points and these regions are missing most of them, it is more sensible to simply exclude the regions shown above; interpolation would not be reliable here.
regions_to_drop = ["Republic of Ingushetia", "Republic of Crimea", "Sevastopol", "Chechen Republic"]
data = data[~data.region.isin(regions_to_drop)]
print(data.isna().sum())
print("New shape of the DataFrame:", data.shape)
year         0
region       0
wine         0
beer         0
vodka        0
champagne    0
brandy       0
dtype: int64
New shape of the DataFrame: (1539, 7)
Done! Now only regions with complete time series are included. It's time to explore the time series!
Because of the way the time series are stored in the DataFrame (one column per drink), we need to reshape the data with the melt method, as follows:
data_melted = data.melt(["year", "region"], var_name="drink", value_name="liters/capita")
data_melted.head()
| | year | region | drink | liters/capita |
|---|---|---|---|---|
| 0 | 1998 | Republic of Adygea | wine | 1.9 |
| 1 | 1998 | Altai Krai | wine | 3.3 |
| 2 | 1998 | Amur Oblast | wine | 2.1 |
| 3 | 1998 | Arkhangelsk Oblast | wine | 4.3 |
| 4 | 1998 | Astrakhan Oblast | wine | 2.9 |
For a more interactive data exploration and to avoid overloading the notebook with zillions of different plots, we will use the plotly package! Feel free to compare any regions you want! But remember: our main objective is to compare the Saint Petersburg region with the others present in our data!
from utils import plot_time_series_by_category
plot_time_series_by_category(data_melted, "year", "liters/capita",
"region", "drink", "simple_white")
Have you noticed how difficult it is to compare regions with time series all at different scales? We can alleviate this problem by normalizing the time series. But how can we do it without breaking the relation between the different time series of each region?
A reasonable way to center and rescale the different time series is to use statistics of the distribution of their values. But note: the mean, standard deviation, minimum and maximum are poor choices in the context of time series, as these statistics are very sensitive to outliers. So what we will do is center each time series at its median and divide its values by the difference between the 90% and 10% quantiles. In this way, we ensure a robust standardization of the different time series, making them more comparable to each other.
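To make this concrete, here is a minimal sketch of that robust scaling on a made-up 19-point series (the numbers are hypothetical, not taken from the dataset):

import numpy as np
# hypothetical 19-point series with one outlier at the end
series = np.array([1.9, 2.1, 2.4, 2.8, 3.0, 3.3, 3.9, 4.2, 4.6, 5.0,
                   5.3, 5.9, 6.4, 6.8, 7.1, 7.7, 8.2, 8.8, 25.0])
# center at the median, scale by the 90%-10% inter-quantile range
scaled = (series - np.quantile(series, .5)) / (np.quantile(series, .9) - np.quantile(series, .1))
print(scaled.round(2))

Unlike the minimum, maximum, mean or standard deviation, the median and the 10%/90% quantiles are barely affected by the outlier at the end of the series.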
plot_time_series_by_category(data_melted, "year", "liters/capita",
"region", "drink", "simple_white", True)
Hasn't it become easier to compare the time series? Still, comparing every pair of regions by eye in the interactive chart is not the best way to answer the business problem. We need a number that ranks the regions by their similarity to Saint Petersburg, and we haven't computed one so far.
A smart way to reduce the cognitive load of comparing drink-consumption time series between regions is to use Dynamic Time Warping (DTW). This time series analysis technique calculates a distance between two series while applying a time warp to align them. It is more robust than the naive Euclidean distance because it accounts for possible "shifts" or "stretching" between the series. A disadvantage of the method is that intuitive distance properties, such as the triangle inequality, are not always satisfied by DTW. See the dotted lines that DTW takes into account in the distance calculation in the figure below:
Caption: Dynamic time warping between two piecewise linear functions. The dotted line illustrates the time-warp relation. Notice that several points in the lower function are mapped to one point in the upper function, and vice versa. Source here.
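To get a feel for what DTW buys us over the Euclidean distance, here is a minimal sketch with two hypothetical sine series (not data from this notebook), using the same dtaidistance package we rely on below:

import numpy as np
from dtaidistance import dtw
t = np.linspace(0, 2 * np.pi, 19)
s1 = np.sin(t)
s2 = np.sin(t + 0.5)  # same shape, shifted in time
print("Euclidean distance:", np.linalg.norm(s1 - s2))  # penalizes the shift point by point
print("DTW distance:      ", dtw.distance(s1, s2))     # the warping absorbs most of the shift

The DTW value comes out smaller because the warping path matches each point with its best counterpart in the other series instead of forcing a strict point-by-point comparison.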
In order to calculate the DTW distance between the corresponding time series from different regions, we will use the dtaidistance package. We will calculate the distance between ALL time series from ALL regions. We will have a lot of numbers now!
from dtaidistance.dtw import distance as dtw_distance
import numpy as np
# function to standardize the time series
def normalize_time_series(time_series):
return (time_series - np.quantile(time_series, q=.5))/\
(np.quantile(time_series, q=.9) - np.quantile(time_series, q=.1))
# the DTW distance is calculated between the corresponding time series of all the regions
distance_data = []
for region1 in data.region.unique():
row_data = {}
row_data["region"] = region1
for drink in ["wine", "beer", "vodka", "champagne", "brandy"]:
for region2 in data.region.unique():
row_data[f"{drink} dist {region2}"] = dtw_distance(data[data["region"] == region1][drink].values,
data[data["region"] == region2][drink].values)
distance_data.append(row_data)
# dataframe with all the distances
distance_data = pd.DataFrame(distance_data)
distance_data.head()
| | region | wine dist Republic of Adygea | wine dist Altai Krai | wine dist Amur Oblast | wine dist Arkhangelsk Oblast | wine dist Astrakhan Oblast | wine dist Republic of Bashkortostan | wine dist Belgorod Oblast | wine dist Bryansk Oblast | wine dist Republic of Buryatia | ... | brandy dist Ulyanovsk Oblast | brandy dist Khabarovsk Krai | brandy dist Republic of Khakassia | brandy dist Khanty–Mansi Autonomous Okrug – Yugra | brandy dist Chelyabinsk Oblast | brandy dist Chuvash Republic | brandy dist Chukotka Autonomous Okrug | brandy dist Sakha (Yakutia) Republic | brandy dist Yamalo-Nenets Autonomous Okrug | brandy dist Yaroslavl Oblast |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Republic of Adygea | 0.000000 | 3.900026 | 7.269264 | 17.626883 | 3.513759 | 4.414941 | 3.125396 | 11.353400 | 3.984997 | ... | 0.850529 | 0.938403 | 0.616523 | 1.895811 | 0.612372 | 0.843623 | 1.482194 | 0.696419 | 2.014671 | 0.919239 |
| 1 | Altai Krai | 3.900026 | 0.000000 | 6.587427 | 20.660663 | 3.291580 | 3.381080 | 4.382933 | 13.245214 | 6.676541 | ... | 0.476340 | 2.031280 | 1.236770 | 3.261533 | 1.196871 | 0.448553 | 3.351477 | 0.855862 | 3.457514 | 1.715372 |
| 2 | Amur Oblast | 7.269264 | 6.587427 | 0.000000 | 11.679940 | 5.299858 | 9.647368 | 8.995115 | 5.654529 | 3.784455 | ... | 0.335410 | 1.074104 | 0.676166 | 2.377646 | 0.464866 | 0.494368 | 2.456827 | 0.253180 | 2.580310 | 0.981886 |
| 3 | Arkhangelsk Oblast | 17.626883 | 20.660663 | 11.679940 | 0.000000 | 16.997803 | 24.876103 | 21.993792 | 4.748737 | 13.172813 | ... | 1.780590 | 0.754785 | 1.340597 | 1.405276 | 1.363121 | 1.901894 | 0.947206 | 1.492180 | 1.548548 | 0.887187 |
| 4 | Astrakhan Oblast | 3.513759 | 3.291580 | 5.299858 | 16.997803 | 0.000000 | 5.969590 | 4.575194 | 10.618663 | 4.609610 | ... | 0.180278 | 1.133005 | 0.615467 | 2.309372 | 0.382230 | 0.406940 | 2.337092 | 0.210000 | 2.513563 | 0.856796 |
5 rows × 406 columns
For each region, we have 405 features, as we calculated the distance between all time series for all regions (81 regions x 5 drink time series = 405 features). We will apply the describe method just to look at some of the statistics relating to these distances.
distance_data.describe()
| | wine dist Republic of Adygea | wine dist Altai Krai | wine dist Amur Oblast | wine dist Arkhangelsk Oblast | wine dist Astrakhan Oblast | wine dist Republic of Bashkortostan | wine dist Belgorod Oblast | wine dist Bryansk Oblast | wine dist Republic of Buryatia | wine dist Vladimir Oblast | ... | brandy dist Ulyanovsk Oblast | brandy dist Khabarovsk Krai | brandy dist Republic of Khakassia | brandy dist Khanty–Mansi Autonomous Okrug – Yugra | brandy dist Chelyabinsk Oblast | brandy dist Chuvash Republic | brandy dist Chukotka Autonomous Okrug | brandy dist Sakha (Yakutia) Republic | brandy dist Yamalo-Nenets Autonomous Okrug | brandy dist Yaroslavl Oblast |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 81.000000 | 81.000000 | 81.000000 | 81.000000 | 81.000000 | 81.000000 | 81.000000 | 81.000000 | 81.000000 | 81.000000 | ... | 81.000000 | 81.000000 | 81.000000 | 81.000000 | 81.000000 | 81.000000 | 81.000000 | 81.000000 | 81.000000 | 81.000000 |
| mean | 9.327511 | 10.586655 | 7.514745 | 12.398218 | 8.580767 | 13.191111 | 11.370777 | 8.459765 | 8.090933 | 9.360812 | ... | 0.984503 | 1.343979 | 1.177580 | 2.149788 | 1.087095 | 1.079536 | 2.187614 | 0.972232 | 2.309936 | 1.198920 |
| std | 5.716611 | 6.793815 | 4.665270 | 6.655064 | 5.830522 | 7.711322 | 6.899607 | 4.923744 | 4.500338 | 4.587454 | ... | 1.005795 | 0.515729 | 0.723902 | 0.883784 | 0.800197 | 1.034073 | 0.866551 | 0.881617 | 0.941177 | 0.493985 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 4.955855 | 5.620374 | 4.022549 | 7.528307 | 4.178576 | 7.445435 | 5.882933 | 4.844915 | 4.847731 | 6.481929 | ... | 0.300167 | 0.967316 | 0.687895 | 1.454510 | 0.559106 | 0.376298 | 1.482194 | 0.380263 | 1.656744 | 0.844156 |
| 50% | 7.395066 | 8.878187 | 6.193626 | 11.333270 | 6.592056 | 11.002363 | 9.318846 | 7.400142 | 7.176078 | 8.628546 | ... | 0.602080 | 1.374809 | 0.900222 | 2.348617 | 0.846995 | 0.657343 | 2.337092 | 0.605062 | 2.553586 | 1.169017 |
| 75% | 12.367417 | 14.192428 | 9.647368 | 15.267279 | 11.452829 | 17.672705 | 15.537635 | 11.066526 | 10.607907 | 11.870594 | ... | 1.089679 | 1.702645 | 1.338096 | 2.724702 | 1.285457 | 1.316055 | 2.753343 | 1.106616 | 2.923696 | 1.430035 |
| max | 29.256215 | 31.695591 | 22.943690 | 31.808007 | 28.718001 | 35.413811 | 33.275733 | 24.180529 | 23.772650 | 23.217969 | ... | 3.704929 | 2.497138 | 3.187350 | 3.749827 | 3.358884 | 3.846505 | 3.827532 | 3.415567 | 3.939797 | 2.308679 |
8 rows × 405 columns
Just above, it is possible to see that some distances have a considerably higher average than others. This is to be expected when a distance refers to a region with a very particular consumption pattern, so it is not surprising. But this excess of features is unwieldy, and we can reduce it using Principal Component Analysis (PCA). This dimensionality reduction method builds linear combinations of the most correlated distance features in order to return components that are uncorrelated with each other and that "spread" the data better. With it, we can even project the regions and their distances onto a two-dimensional plot. This is extremely useful! Below, we will apply PCA.
from sklearn.decomposition import PCA
pca = PCA(n_components=.95, # we want to explain at least 95% of the total variance
random_state=669)
X = pca.fit_transform(distance_data.drop(columns="region"))
print(f"Total explained variance ratio by the first 2 components:", pca.explained_variance_ratio_[:2].sum()*100, "%")
X = pd.DataFrame(X, columns=[f"PC {i}" for i in range(1, pca.n_components_ + 1)])
X["region"] = distance_data["region"]
X.head()
Total explained variance ratio by the first 2 components: 91.11352441115945 %
| | PC 1 | PC 2 | PC 3 | PC 4 | region |
|---|---|---|---|---|---|
| 0 | 602.317963 | -200.959146 | -33.206956 | -19.736541 | Republic of Adygea |
| 1 | -97.656202 | -250.621705 | -63.924742 | 8.505152 | Altai Krai |
| 2 | -256.190093 | -97.716054 | 18.604340 | 18.419457 | Amur Oblast |
| 3 | -25.303670 | -18.988241 | 98.487830 | 172.392736 | Arkhangelsk Oblast |
| 4 | -170.742741 | -96.718758 | -96.177554 | 12.022089 | Astrakhan Oblast |
Note above that, to explain at least 95% of the total variance of the DTW distances between regions, only 4 principal components were needed. More specifically, the first 2 principal components alone explain more than 90% of the total variance!
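As a quick sanity check, we can print how much of the total variance each retained component explains individually (a small sketch reusing the pca object fitted above):

# explained variance ratio of each retained principal component
for i, ratio in enumerate(pca.explained_variance_ratio_, start=1):
    print(f"PC {i}: {ratio * 100:.2f}% of the total variance")

Now let's visualize the distances of the points on a two-dimensional plot!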
import plotly.express as px
from sklearn.metrics.pairwise import euclidean_distances
# using the 4 principal components, calculate the euclidean distance
X["Distance to Saint Petersburg"] = euclidean_distances(X.drop(columns=["region"]),
X[X["region"] == "Saint Petersburg"].drop(columns=["region"]))
# visualizing the position of the regions
fig = px.scatter(X, x="PC 1", y="PC 2", hover_data = "region",
color="Distance to Saint Petersburg", color_continuous_scale='Turbo')
fig.update_layout(xaxis_title="Principal Component 1",
yaxis_title="Principal Component 2",
title="Alcohol consumption pattern: bidimensional projection of all regions",
template="simple_white")
fig.update_traces(marker=dict(size=9))
fig.show()
Wow! We have an amazing chart! Note that the colors represent the Euclidean distance to Saint Petersburg, computed on the 4 principal components extracted by PCA. From the plot, we can see that Saint Petersburg is relatively isolated in the upper left corner. Despite this, some regions are similar to Saint Petersburg, such as (1) Omsk Oblast, (2) Chelyabinsk Oblast and (3) Moscow Oblast.
Another interesting fact is that Zabaykalsky Krai is very different from all the other regions!
To make it easier to construct our list, let's plot an ordered bar plot showing the regions most similar to Saint Petersburg in terms of alcohol consumption pattern.
fig = px.bar(X.sort_values("Distance to Saint Petersburg", ascending=False),
x="Distance to Saint Petersburg", y="region", orientation='h', template="simple_white",
title="Alcohol consumption pattern: regions most similar to Saint Petersburg",
color="Distance to Saint Petersburg",
color_continuous_scale='Turbo',
width=800, height=1400)
fig.show()
We did it! Using the DTW distance and PCA for dimensionality reduction, we were able to create the incredible plot above and identify the regions most similar to Saint Petersburg in terms of alcohol consumption pattern: they are the ones with the shortest bars.
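If you prefer to pull that list out programmatically instead of reading it off the chart, here is a minimal sketch reusing the X DataFrame built above:

# the 10 regions closest to Saint Petersburg (excluding Saint Petersburg itself)
top_10 = (X[X["region"] != "Saint Petersburg"]
          .nsmallest(10, "Distance to Saint Petersburg")["region"]
          .tolist())
print(top_10)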
Feel free to compare the time series of these regions with those of Saint Petersburg using our interactive plots!
Thank you!