1 | Initial Setup
1.1 | Package Installation
- `numpy` and `pandas` for basic data manipulation
- `sklearn` and associated modules to run our machine learning algorithms
- `scipy` and associated modules (`norm`) to compute the cosine similarity later on
- `seaborn` to output graphs
- Miscellaneous packages for the flow of the notebook
! pip install utils
! pip install scikit-learn
import pandas as pd
import numpy as np
from numpy.linalg import norm
import scipy as sc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer
from sklearn import preprocessing
from scipy.linalg import norm
import seaborn as sns
from typing import List
from utils import *
import re
from collections import Counter
import sys
1.2 | Connecting to Github and Google Drive
- This notebook is connected to the associated GitHub repository for easy access and version control
- Optionally, it is also tested and connected to Google Drive if possible
# Runs only when the google.colab module is already loaded (presumably, when
# the notebook is executing on Google Colab).
if 'google.colab' in sys.modules:
# Start from a fresh checkout: remove any previous clone under /content,
# clone the project repository again, then work from inside it.
%cd /content
!rm -rf Datathon_F22
!git clone https://github.com/sunnydigital/Datathon_F22
%cd Datathon_F22
# Same Colab guard: mount Google Drive at /content/drive.
if 'google.colab' in sys.modules:
from google.colab import drive
drive.mount('/content/drive')
1.3 | Preprocessing Product Category Names
- After initially reading in `product_category_name_translation.csv`, we need to transform the data into a usable format
- An initial inspection of the number of products under each category is displayed
- The English translations for product names are given (from the associated column in `product_category_name_translation.csv`)
- For each category, the individual words making up the category name are split using `re.split`
# Translation table: original (Portuguese) category name -> English name.
trans_df = pd.read_csv('datasets/product_category_name_translation.csv')

# Original category names, extended with four hand-added labels.
# NOTE(review): the printed mapping shows 'pcs' at index 73 and no entry for
# index 53, which suggests 'pcs' is already in the CSV and is appended a
# second time here -- confirm whether the duplicate is intended.
og_cat_name = np.append(
    trans_df['product_category_name'].to_numpy(),
    ['portateis_cozinha_e_preparadores_de_alimentos', 'pc_gamer', 'pcs', 'other'],
)

# Name -> position lookup; a repeated name keeps its last position.
og_name_to_idx = {name: i for i, name in enumerate(og_cat_name)}
display(og_name_to_idx)

# English category names, extended with the same four labels (untranslated).
product_categories = np.append(
    trans_df['product_category_name_english'].to_numpy(),
    ['portateis_cozinha_e_preparadores_de_alimentos', 'pc_gamer', 'pcs', 'other'],
)
display(product_categories)

# Break every category name into its words (separated by '_' or ' ').
product_categories_list = [re.split('_| ', category) for category in product_categories]
product_categories_list
{'beleza_saude': 0,
'informatica_acessorios': 1,
'automotivo': 2,
'cama_mesa_banho': 3,
'moveis_decoracao': 4,
'esporte_lazer': 5,
'perfumaria': 6,
'utilidades_domesticas': 7,
'telefonia': 8,
'relogios_presentes': 9,
'alimentos_bebidas': 10,
'bebes': 11,
'papelaria': 12,
'tablets_impressao_imagem': 13,
'brinquedos': 14,
'telefonia_fixa': 15,
'ferramentas_jardim': 16,
'fashion_bolsas_e_acessorios': 17,
'eletroportateis': 18,
'consoles_games': 19,
'audio': 20,
'fashion_calcados': 21,
'cool_stuff': 22,
'malas_acessorios': 23,
'climatizacao': 24,
'construcao_ferramentas_construcao': 25,
'moveis_cozinha_area_de_servico_jantar_e_jardim': 26,
'construcao_ferramentas_jardim': 27,
'fashion_roupa_masculina': 28,
'pet_shop': 29,
'moveis_escritorio': 30,
'market_place': 31,
'eletronicos': 32,
'eletrodomesticos': 33,
'artigos_de_festas': 34,
'casa_conforto': 35,
'construcao_ferramentas_ferramentas': 36,
'agro_industria_e_comercio': 37,
'moveis_colchao_e_estofado': 38,
'livros_tecnicos': 39,
'casa_construcao': 40,
'instrumentos_musicais': 41,
'moveis_sala': 42,
'construcao_ferramentas_iluminacao': 43,
'industria_comercio_e_negocios': 44,
'alimentos': 45,
'artes': 46,
'moveis_quarto': 47,
'livros_interesse_geral': 48,
'construcao_ferramentas_seguranca': 49,
'fashion_underwear_e_moda_praia': 50,
'fashion_esporte': 51,
'sinalizacao_e_seguranca': 52,
'pcs': 73,
'artigos_de_natal': 54,
'fashion_roupa_feminina': 55,
'eletrodomesticos_2': 56,
'livros_importados': 57,
'bebidas': 58,
'cine_foto': 59,
'la_cuisine': 60,
'musica': 61,
'casa_conforto_2': 62,
'portateis_casa_forno_e_cafe': 63,
'cds_dvds_musicais': 64,
'dvds_blu_ray': 65,
'flores': 66,
'artes_e_artesanato': 67,
'fraldas_higiene': 68,
'fashion_roupa_infanto_juvenil': 69,
'seguros_e_servicos': 70,
'portateis_cozinha_e_preparadores_de_alimentos': 71,
'pc_gamer': 72,
'other': 74}
array(['health_beauty', 'computers_accessories', 'auto', 'bed_bath_table',
'furniture_decor', 'sports_leisure', 'perfumery', 'housewares',
'telephony', 'watches_gifts', 'food_drink', 'baby', 'stationery',
'tablets_printing_image', 'toys', 'fixed_telephony',
'garden_tools', 'fashion_bags_accessories', 'small_appliances',
'consoles_games', 'audio', 'fashion_shoes', 'cool_stuff',
'luggage_accessories', 'air_conditioning',
'construction_tools_construction',
'kitchen_dining_laundry_garden_furniture',
'costruction_tools_garden', 'fashion_male_clothing', 'pet_shop',
'office_furniture', 'market_place', 'electronics',
'home_appliances', 'party_supplies', 'home_confort',
'costruction_tools_tools', 'agro_industry_and_commerce',
'furniture_mattress_and_upholstery', 'books_technical',
'home_construction', 'musical_instruments',
'furniture_living_room', 'construction_tools_lights',
'industry_commerce_and_business', 'food', 'art',
'furniture_bedroom', 'books_general_interest',
'construction_tools_safety', 'fashion_underwear_beach',
'fashion_sport', 'signaling_and_security', 'computers',
'christmas_supplies', 'fashio_female_clothing',
'home_appliances_2', 'books_imported', 'drinks', 'cine_photo',
'la_cuisine', 'music', 'home_comfort_2',
'small_appliances_home_oven_and_coffee', 'cds_dvds_musicals',
'dvds_blu_ray', 'flowers', 'arts_and_craftmanship',
'diapers_and_hygiene', 'fashion_childrens_clothes',
'security_and_services',
'portateis_cozinha_e_preparadores_de_alimentos', 'pc_gamer', 'pcs',
'other'], dtype=object)
[['health', 'beauty'],
['computers', 'accessories'],
['auto'],
['bed', 'bath', 'table'],
['furniture', 'decor'],
['sports', 'leisure'],
['perfumery'],
['housewares'],
['telephony'],
['watches', 'gifts'],
['food', 'drink'],
['baby'],
['stationery'],
['tablets', 'printing', 'image'],
['toys'],
['fixed', 'telephony'],
['garden', 'tools'],
['fashion', 'bags', 'accessories'],
['small', 'appliances'],
['consoles', 'games'],
['audio'],
['fashion', 'shoes'],
['cool', 'stuff'],
['luggage', 'accessories'],
['air', 'conditioning'],
['construction', 'tools', 'construction'],
['kitchen', 'dining', 'laundry', 'garden', 'furniture'],
['costruction', 'tools', 'garden'],
['fashion', 'male', 'clothing'],
['pet', 'shop'],
['office', 'furniture'],
['market', 'place'],
['electronics'],
['home', 'appliances'],
['party', 'supplies'],
['home', 'confort'],
['costruction', 'tools', 'tools'],
['agro', 'industry', 'and', 'commerce'],
['furniture', 'mattress', 'and', 'upholstery'],
['books', 'technical'],
['home', 'construction'],
['musical', 'instruments'],
['furniture', 'living', 'room'],
['construction', 'tools', 'lights'],
['industry', 'commerce', 'and', 'business'],
['food'],
['art'],
['furniture', 'bedroom'],
['books', 'general', 'interest'],
['construction', 'tools', 'safety'],
['fashion', 'underwear', 'beach'],
['fashion', 'sport'],
['signaling', 'and', 'security'],
['computers'],
['christmas', 'supplies'],
['fashio', 'female', 'clothing'],
['home', 'appliances', '2'],
['books', 'imported'],
['drinks'],
['cine', 'photo'],
['la', 'cuisine'],
['music'],
['home', 'comfort', '2'],
['small', 'appliances', 'home', 'oven', 'and', 'coffee'],
['cds', 'dvds', 'musicals'],
['dvds', 'blu', 'ray'],
['flowers'],
['arts', 'and', 'craftmanship'],
['diapers', 'and', 'hygiene'],
['fashion', 'childrens', 'clothes'],
['security', 'and', 'services'],
['portateis', 'cozinha', 'e', 'preparadores', 'de', 'alimentos'],
['pc', 'gamer'],
['pcs'],
['other']]
2 | GloVe Embeddings
2.1 | Indexer Object and Embedding initialization
- An `Indexer` object is created to map words to integer indices (and back) using dictionaries
- `read_word_embeddings` is used to interface between the stored GloVe-50d embeddings in the file `glove.6B.50d-relativized.txt` and the dictionary protocol defined previously in the `Indexer` object
- Finally, an indexer object, `word_indexer`, and the embedded dictionary object, `embeddings`, are returned
class Indexer:
    """
    Two-way lookup table between hashable objects and consecutive integers
    starting at 0 (handy for turning words, labels or features into row
    numbers of a matrix).

    Attributes:
        @objs_to_ints: dict mapping each stored object to its integer index
        @ints_to_objs: dict mapping each integer index back to its object
    """

    def __init__(self):
        self.objs_to_ints = {}
        self.ints_to_objs = {}

    def __repr__(self):
        # String form of the list of stored objects, in index order.
        return str([str(self.get_object(i)) for i in range(len(self))])

    def __str__(self):
        return repr(self)

    def __len__(self):
        return len(self.objs_to_ints)

    def get_object(self, index):
        """
        @param index: integer index to look up
        @return: the object stored at that index, or None if the index is unused
        """
        return self.ints_to_objs.get(index)

    def contains(self, object):
        """
        @param object: object to look up
        @return: True if the object has been indexed, False otherwise
        """
        position = self.index_of(object)
        return position != -1

    def index_of(self, object):
        """
        @param object: object to look up
        @return: the object's index, or -1 if it has not been indexed
        """
        return self.objs_to_ints.get(object, -1)

    def add_and_get_index(self, object, add=True):
        """
        Looks up an object, registering it under the next free index first
        when it is new and adding is allowed.
        @param object: object to look up or add
        @param add: True by default; when False nothing is added and the call
            is equivalent to index_of (so it can return -1)
        @return: the index of the object
        """
        if add and object not in self.objs_to_ints:
            # The next free index is simply the current size.
            position = len(self.objs_to_ints)
            self.objs_to_ints[object] = position
            self.ints_to_objs[position] = object
        return self.index_of(object)
def read_word_embeddings(embeddings_file: str) -> "tuple[Indexer, np.ndarray]":
    """
    Loads the given embeddings (ASCII-formatted, one "word v1 v2 ... vd" entry per line) and
    augments them with PAD (index 0) and UNK (index 1) entries whose vectors are all zeros.
    Reads in all embeddings with no filtering -- you should only use this for relativized
    word embedding files.
    @param embeddings_file: path to the file containing embeddings
    @return: (word_indexer, vectors) where word_indexer is an Indexer mapping each word to
        its row number and vectors is a 2-D numpy array whose row i is the embedding of word i
    @raise ValueError: if the file contains no embedding lines
    """
    word_indexer = Indexer()
    vectors = []
    # Position 0 is the PAD token, position 1 the UNK token
    word_indexer.add_and_get_index("PAD")
    word_indexer.add_and_get_index("UNK")
    # The context manager closes the file even if a line fails to parse
    with open(embeddings_file, encoding="utf-8") as f:
        for line in f:
            if line.strip() == "":
                continue
            space_idx = line.find(' ')
            word = line[:space_idx]
            vector = np.array([float(number_str) for number_str in line[space_idx + 1:].split()])
            # A repeated token would get no new index but would still add a row, shifting
            # every later row out of line with the indexer; keep the first occurrence only
            if word_indexer.contains(word):
                continue
            word_indexer.add_and_get_index(word)
            # The PAD and UNK rows can only be created once the first vector has been read,
            # because that is what reveals the embedding dimension
            if len(vectors) == 0:
                vectors.append(np.zeros(vector.shape[0]))
                vectors.append(np.zeros(vector.shape[0]))
            vectors.append(vector)
    if len(vectors) == 0:
        raise ValueError("No embeddings found in " + repr(embeddings_file))
    print("Read in " + repr(len(word_indexer)) + " vectors of size " + repr(vectors[0].shape[0]))
    # Turn vectors into a 2-D numpy array
    return word_indexer, np.array(vectors)
# Load the relativized GloVe-50d file: `word_indexer` is the first returned value
# and `embeddings` the numpy array of vectors returned alongside it.
word_indexer, embeddings = read_word_embeddings('glove/glove.6B.50d-relativized.txt')
Read in 14923 vectors of size 50
2.2 | Embedding Concatenation
- `get_embedding` is a function defined to obtain the embedding for any product category; it is meant to deal with categories described by multiple words
- Two ways of dealing with multi-word product categories are included:
  - `mean`, where the GloVe-50d embeddings of the category's words are averaged element-wise, giving one value per dimension across all 50 dimensions
  - `sum`, where the GloVe-50d embeddings of the category's words are summed element-wise, giving one value per dimension across all 50 dimensions
- The embedding for each product category is returned based on the previously determined methodology for word handling
def get_embedding(word_indexer, embeddings, product_categories, reduce='mean'):
    """
    Builds one embedding vector per product category by combining the embeddings of the
    words in the category name (words are separated by '_' or ' ').
    @param word_indexer object whose index_of(word) returns the row of the word in the
        embedding matrix, or -1 when the word is unknown
    @param embeddings the 2-D embeddings matrix of all words (one row per word)
    @param product_categories iterable of category name strings
    @param reduce 'mean' to average the word vectors element-wise, 'sum' to add them
    @return dict mapping each category name to its combined embedding vector
    @raise ValueError if reduce is neither 'mean' nor 'sum'
    """
    if reduce == 'mean':
        combine = np.mean
    elif reduce == 'sum':
        combine = np.sum
    else:
        # Previously an unknown mode silently returned the last word's vector
        raise ValueError("reduce must be 'mean' or 'sum', got " + repr(reduce))

    # Unknown words must use the UNK row. Indexing with the raw -1 from index_of would
    # silently pick the LAST row of the matrix (the vector of an unrelated word).
    unk_index = word_indexer.index_of("UNK")

    category_embeddings = {}
    # Split the names passed in rather than relying on a module-level list
    for category in product_categories:
        word_vectors = []
        for word in re.split('_| ', category):
            index_word = word_indexer.index_of(word)
            if index_word < 0:
                index_word = unk_index
            if index_word < 0:
                # The indexer has no UNK entry either: fall back to a zero vector
                word_vectors.append(np.zeros(embeddings.shape[1]))
            else:
                word_vectors.append(embeddings[index_word])
        # axis=0 combines across words, keeping one value per embedding dimension
        category_embeddings[category] = combine(word_vectors, axis=0)
    return category_embeddings
# NOTE(review): this rebinds `embeddings` from the value passed in as the embedding
# matrix to the per-category dict returned by get_embedding, so re-running this cell
# without reloading the matrix passes the dict back in -- consider a separate name.
embeddings = get_embedding(word_indexer, embeddings, product_categories)
embeddings
{'perfumery': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172]),
'artes': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172]),
'sport_leisure': array([-6.5777500e-01, 2.3660000e-01, -5.2690000e-01, -1.0300000e-03,
-3.7766500e-01, -2.4336400e-01, 1.1789000e-01, -2.6644000e-01,
1.3412000e-02, 2.1341800e-01, 5.8577000e-01, 6.4220000e-02,
-8.8555000e-02, 1.7286500e-01, 2.5672500e-01, -3.4311500e-01,
2.7832000e-01, 1.2754050e+00, 1.1235000e-02, -8.0732000e-01,
-6.4122500e-01, 1.1676450e-01, -2.9102100e-01, 4.8428000e-01,
1.5141000e-01, -1.3723500e-01, -6.3680000e-01, 3.7784500e-01,
1.5137115e-01, 3.4792000e-01, 5.6390000e-01, 6.3726550e-01,
-2.4579550e-01, -9.4106500e-02, -2.6462000e-01, 4.5140000e-01,
-4.5003500e-02, 1.4046500e-01, -5.2468500e-01, -2.7534950e-01,
1.5362100e-01, -5.9103000e-01, 6.8940000e-02, 4.5214500e-01,
2.8340550e-01, -3.7257450e-01, -1.5898500e-01, 2.5181000e-01,
5.0931500e-01, -1.2091640e-01]),
'babies': array([ 0.69898 , -0.056312, 0.34178 , -1.4674 , 0.54603 , 1.7752 ,
-0.57618 , -0.60623 , 0.83499 , 0.42334 , 0.50787 , -0.57778 ,
1.0339 , 0.27791 , 1.4515 , 0.58806 , -0.96275 , 0.33018 ,
0.45507 , -0.24377 , -0.15741 , 0.95016 , 0.71083 , 0.23539 ,
0.66137 , -0.61362 , -0.50664 , -0.24646 , 0.32122 , -0.50601 ,
1.5551 , 1.1833 , -0.19435 , 0.40841 , 0.50837 , 0.11851 ,
0.82978 , -0.22482 , 0.84309 , -0.27989 , -1.2027 , -0.50551 ,
0.49102 , 0.5024 , 1.5571 , -0.025314, -0.032675, -0.68783 ,
-0.072284, -0.22732 ]),
'domestic_utilities': array([-0.2169885 , -0.259665 , -0.089036 , -0.232085 , 0.02123 ,
0.02266 , 0.303745 , -0.30865 , 0.304807 , 0.24873115,
0.27609 , 0.182715 , 0.44881 , 0.297305 , 0.20334 ,
-0.2240425 , 0.0246 , 0.3544 , 0.21105 , -0.38053 ,
0.289055 , -0.3395355 , -0.114346 , -0.12142 , -0.0075 ,
-0.118635 , 0.00909 , 0.02115 , 0.324625 , 0.756815 ,
1.0631 , 0.6046155 , 0.4585545 , -0.0684765 , -0.31949 ,
0.4114665 , -0.3848335 , -0.1004675 , -0.565625 , -0.2515545 ,
-0.339595 , -0.245639 , 0.28604 , 0.173715 , 0.44828 ,
-0.3250159 , -0.2789 , 0.435085 , 0.45348 , 0.0225586 ]),
'musical instruments': array([-0.06468 , 0.322055 , -0.963365 , -0.114635 , -0.177675 ,
0.54965 , -0.1838395 , -0.4549 , -0.64644 , 1.3871 ,
0.57586 , 0.94313 , 0.151145 , 1.07977 , -0.49424 ,
-0.380291 , -0.17956 , 0.1215325 , -0.297625 , -0.85511 ,
0.865405 , 0.562785 , -0.186045 , -0.499275 , 0.152454 ,
-0.0582182 , -1.35372 , -0.50429 , -0.154655 , -0.653355 ,
2.9801 , -0.507645 , 0.638315 , -0.33097815, 0.5617 ,
0.35252 , 0.186936 , 0.0288775 , -0.228075 , 0.122783 ,
0.05871 , 0.03846 , 0.147365 , -0.122755 , -0.14569 ,
0.339465 , 0.54428 , -0.093315 , -1.1327 , 0.159625 ]),
'cool_stuff': array([-4.932950e-01, 2.035540e-01, -8.313395e-02, -4.249600e-01,
9.794000e-02, -4.397210e-01, -3.272700e-01, -2.336160e-01,
-3.763900e-01, 6.511450e-01, -5.402300e-01, 3.683300e-01,
-1.061500e-02, 6.245650e-01, 5.724015e-01, 5.704455e-01,
3.654400e-01, 4.361400e-01, -1.085555e-01, -1.033350e+00,
-5.114250e-02, 5.775950e-01, 8.279250e-01, 3.179500e-01,
4.330600e-01, -8.627450e-01, -8.960600e-01, 1.115915e+00,
1.720000e+00, -2.354450e-01, 2.476200e+00, 2.326425e-01,
-7.899500e-03, 1.246350e-01, -1.432040e-01, 5.095900e-01,
-3.377350e-01, 3.771100e-01, 3.613050e-01, -7.046750e-01,
1.961800e-01, 3.419200e-01, -9.439450e-02, 3.296600e-01,
4.638600e-01, 4.373350e-01, 4.684150e-01, -4.642925e-01,
2.028000e-03, 6.441700e-01]),
'furniture_decoration': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172]),
'home appliances': array([-0.11147 , 0.160835 , 0.01424 , -0.297908 , 0.231035 ,
-0.46969 , -0.473055 , 0.333195 , -0.0387957 , 0.258926 ,
-0.216425 , -0.15653 , 0.26442 , 0.047085 , 0.04186 ,
-0.221237 , 0.024125 , 0.23801655, 0.0201 , 0.02417 ,
-0.166215 , 0.2318245 , -0.276051 , 0.153675 , 0.016345 ,
-0.324785 , -0.048915 , 0.14006 , 0.401985 , -0.12704 ,
1.02255 , 0.4103305 , 0.1114745 , 0.024657 , 0.09831 ,
0.6256 , 0.1077315 , 0.135195 , 0.085595 , -0.2381195 ,
0.013745 , -0.16949 , -0.228925 , 0.31469 , 0.319578 ,
-0.04022 , -0.16518 , -0.316485 , 0.24828 , -0.0955514 ]),
'toys': array([ 0.11738 , -0.78864 , 0.20307 , 0.40609 , 0.53572 , 0.66853 ,
-1.0303 , -2.0666 , 0.67746 , 0.41301 , -0.036541, 0.57572 ,
0.2052 , -0.13067 , 1.1972 , 0.69488 , -0.7459 , 1.0515 ,
0.22374 , -1.0436 , 1.1563 , 0.12272 , -0.091888, -0.070431,
-0.35183 , -0.37046 , -1.4255 , -0.078561, 0.716 , -1.3362 ,
1.3186 , 0.995 , 0.52411 , 0.38252 , 0.73561 , 1.1922 ,
0.30399 , -0.19526 , -0.036646, -0.48951 , -0.36473 , 0.2109 ,
0.16709 , 0.18673 , 1.3641 , 1.1111 , -0.17835 , -0.31437 ,
0.12497 , -0.14976 ]),
'bed table bath': array([ 0.121282 , 1.07551 , -0.50220667, -0.33427333, 0.37935 ,
-0.153277 , -0.141247 , 0.02063833, -0.05749 , -0.60773667,
-0.4423 , 0.2403 , -0.190591 , 0.27271 , 0.21277667,
0.86095667, 0.09901333, 0.06610467, -0.13987933, -0.73049667,
0.68601 , 0.55938333, 0.48579333, 0.25348 , -0.37527667,
-0.423359 , -0.130161 , 0.77310667, 0.42638333, 0.09753433,
2.26126667, 0.27597467, -0.23237503, 0.67459667, 0.37090333,
0.82384333, 0.57042 , 0.48660667, 0.932469 , -0.33592333,
0.46510333, -0.20701167, -0.30365333, 0.71547 , 0.31835667,
0.42810333, 0.18425667, -1.03925233, 0.38622333, -0.54126 ]),
'construction_tools_security': array([ 0.50816667, -0.15552927, 0.61994 , 0.21229467, -0.02190367,
-0.32106667, -0.30894 , -0.80907333, 0.16325333, -0.592376 ,
-0.16282 , -0.10930333, -0.347921 , -0.05749967, -0.32407333,
0.31827333, 0.13833333, 0.25797333, 0.58916667, -0.494492 ,
0.94635 , -0.05304667, -0.88858 , -0.78261 , -0.048928 ,
-1.00035333, 0.02441667, -0.399411 , 0.35588333, 0.09051667,
3.3164 , -0.50588333, -0.39591667, -0.65001233, 0.01619 ,
0.8768 , 0.254316 , 0.20291667, -0.09791267, 0.423946 ,
-0.13481333, -0.37350333, 0.4858 , 0.20931667, 0.53948267,
-0.29450233, -0.04733667, 0.60774667, 0.18903 , -0.15894333]),
'informatica_accessories': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172]),
'beauty_health': array([ 0.149462 , 0.837515 , -0.658989 , 0.11762 , 0.156495 ,
0.311425 , -0.2985085, -0.72018 , 1.114425 , 0.224965 ,
0.18148 , -0.1437515, 0.36231 , -0.316605 , 0.29906 ,
0.187535 , -0.318833 , 0.19092 , 0.534318 , -0.15734 ,
-0.095259 , 1.056855 , -0.295065 , -0.079285 , 0.14422 ,
-1.22806 , -1.06188 , -0.15872 , -0.05257 , 0.38244 ,
2.6115 , 0.681295 , -0.01553 , -0.803845 , -0.079915 ,
0.240563 , -0.286601 , 0.450905 , 0.700525 , -0.472489 ,
-0.326875 , 0.212085 , 0.4923 , 0.25222 , 0.270245 ,
0.37235 , -0.448995 , -0.188735 , 0.56276 , 0.34234 ]),
'bags_accessories': array([-0.09753 , -0.21828 , 0.39673 , -0.541435 , 0.28565 ,
-0.250009 , 0.16389 , -0.432615 , 0.587842 , -0.11338 ,
0.13238 , 0.034195 , 0.52179 , 0.371215 , 0.406465 ,
0.015975 , -0.30545 , 0.43989 , 0.361695 , -1.02972 ,
0.11384 , -0.02404 , 0.438889 , -0.109875 , -0.292865 ,
0.148435 , -0.41799 , 0.785775 , 0.695215 , -0.208865 ,
0.1957 , 0.6654155, -0.2863305, 0.4026785, 0.102555 ,
1.22624 , 0.2838865, 0.30733 , 0.079865 , 0.0776355,
0.35376 , 0.099455 , 0.17175 , 0.439675 , 0.51477 ,
-0.338477 , -0.051525 , -0.07325 , 0.385935 , -0.5949414]),
'garden_tools': array([-0.13122 , -0.45077 , 0.20832 , -0.15525 , -0.0177905,
-0.273765 , 0.207975 , -0.589265 , 0.100587 , 0.265581 ,
0.0123335, 0.303705 , 0.4274135, 0.2284005, -0.441685 ,
-0.167755 , 0.30533 , 0.4014 , 0.67399 , -0.67052 ,
0.135475 , -0.0920255, -0.201911 , -0.296385 , 0.375905 ,
0.38115 , -0.546515 , -0.447175 , 0.41337 , -0.095075 ,
0.8753 , -0.3589695, -0.2505255, -0.3293365, -0.10512 ,
1.16314 , -0.0027195, 0.341125 , -0.140954 , -0.0208605,
0.459025 , -0.554825 , -0.05076 , 0.85532 , 1.007625 ,
-0.3457835, 0.583475 , 0.42408 , 0.202085 , -0.1239964]),
'furniture_office': array([-0.100575 , -0.0411355 , 0.043105 , -0.209865 , 0.18825 ,
-0.59084 , 0.02806 , -0.02936 , -0.0272195 , -0.39515 ,
-0.306055 , -0.054165 , 0.205645 , 0.2346751 , 0.092645 ,
-0.11909 , -0.277255 , 0.274721 , 0.58935 , -0.1465255 ,
0.294605 , 0.0838895 , -0.231836 , -0.210795 , -0.24414 ,
-0.562935 , -0.1818305 , 0.36765 , -0.19358 , 0.27433 ,
0.8427 , -0.0921145 , -0.2660805 , -0.5448265 , -0.340315 ,
0.51402 , 0.2321565 , 0.036925 , 0.131865 , -0.1284995 ,
-0.039955 , 0.025175 , -0.247405 , 0.25581025, -0.044475 ,
-0.40652 , -0.442715 , 0.0650475 , 0.508945 , 0.2105486 ]),
'automotive': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172]),
'electronic': array([-6.6594e-01, -6.5510e-01, 9.2148e-01, 6.8301e-01, -4.7822e-01,
-2.8030e-01, -5.3848e-01, -1.0404e+00, 8.6666e-01, 9.4106e-01,
6.8266e-01, 1.6785e-01, 2.7838e-01, 1.1826e+00, -4.1586e-01,
-5.3318e-02, -1.2734e+00, -2.2728e-01, 8.7516e-02, -9.1505e-01,
1.7087e+00, -1.1551e-01, -7.6449e-01, 2.7068e-02, -2.9328e-02,
-7.2308e-01, -7.4972e-01, -7.2909e-01, 4.0463e-01, -5.0634e-01,
2.9845e+00, -5.8706e-01, 3.0288e-01, -5.0952e-01, -3.2629e-01,
4.3184e-01, 2.9218e-01, -2.9649e-01, -1.7555e-03, 1.2518e-01,
1.0972e+00, 1.6128e-01, 2.2262e-01, 6.7454e-01, 1.5689e-01,
2.6961e-01, 7.8616e-01, 3.6450e-01, -4.2447e-01, -5.8735e-02]),
'fashion_shoes': array([-0.529271 , 0.187405 , -0.706595 , -0.150854 , 0.257785 ,
-0.062465 , -0.64265 , -1.31045 , 0.1302085 , 0.121555 ,
0.15193675, -0.2591685 , -0.87015 , 0.13515 , 0.622815 ,
0.30194 , -0.52321 , 0.184355 , 0.289615 , -1.27407 ,
0.267221 , 0.79496 , -0.58884 , 0.123875 , -0.49169 ,
-0.965705 , -1.28342 , 0.82245 , 0.54177 , -0.765955 ,
1.8377 , 0.77242 , 0.12483 , 0.23106 , -0.020515 ,
0.487787 , -0.031645 , 0.97589 , -0.577765 , -0.24196 ,
0.33837 , 0.358545 , 0.41685 , 0.10506 , 0.753915 ,
-0.498315 , 0.04316 , -0.85085 , 0.0436 , 0.16844 ]),
'telephony': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172]),
'stationary store': array([ 0.431825 , -0.159595 , 0.7213 , 0.0359543, 0.5905305,
-0.276996 , -0.849492 , -0.3911 , 0.629875 , -0.1776875,
0.5078325, -0.59475 , -0.20207 , 0.137035 , 0.00635 ,
0.40943 , -0.61595 , 0.677255 , -0.25467 , -1.192875 ,
0.8558665, -0.411145 , -0.565025 , 0.826405 , -0.09058 ,
-0.349225 , -0.38166 , 0.59205 , 1.1161 , -0.07926 ,
1.7939 , -0.210345 , 0.169255 , -0.096005 , 0.29153 ,
0.1508485, 0.194135 , 0.55134 , 0.201635 , 0.543336 ,
0.490335 , -0.2112085, -0.356055 , 0.361025 , -0.00265 ,
-0.22146 , 0.32249 , -0.780135 , 0.39682 , -0.009155 ]),
'fashion_bolsas_e_acessorios': array([-0.3612355 , -0.0531225 , -0.496455 , -0.4822595 , -0.00221 ,
-0.3775225 , 0.4505125 , -0.1233975 , -0.08142025, 0.5212425 ,
0.06148088, 0.0285875 , 0.3185425 , 0.4761475 , -0.11933 ,
-0.41105 , 0.146635 , 0.276585 , 0.4649475 , -0.38809 ,
-0.2147625 , 0.12867925, -0.1729265 , 0.103575 , 0.078905 ,
0.530345 , -0.7632925 , 0.3161625 , 0.29196 , 0.237145 ,
-0.360725 , 0.17938325, 0.11933925, -0.11785975, -0.3642725 ,
0.566631 , -0.04145275, 0.16126 , -0.4949825 , -0.13645425,
0.2658775 , -0.3131925 , -0.1265925 , 0.33689 , 0.6194475 ,
-0.5635775 , 0.15796 , -0.0090975 , 0.30098 , 0.2050404 ]),
'pcs': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172]),
'house_construction': array([ 0.70555 , 0.293185 , 0.179376 , -0.08316 , 0.05319 ,
0.189115 , -0.89585 , -0.2394795 , -0.44347 , -0.729345 ,
-0.9164635 , -0.29398 , -0.1271705 , -0.20140725, -0.0930295 ,
0.31127 , 0.028011 , 0.237485 , 0.254165 , -0.539355 ,
1.42785 , -0.197095 , -1.077485 , -0.758905 , -0.204447 ,
-1.257615 , -0.097675 , 0.124841 , -0.017775 , 0.153878 ,
3.13085 , -0.98483 , -0.22145 , -0.2723735 , 0.171395 ,
0.0940355 , 0.41796 , 0.220105 , 0.33042 , 0.07027 ,
-0.579885 , -0.345695 , -0.00575 , -0.055565 , 0.007649 ,
0.01957 , -0.75081 , -0.545575 , 0.154991 , -0.043825 ]),
'watches_gifts': array([ 0.13938 , 0.490805 , -0.10947 , 0.016295 , 0.826165 ,
-0.07303 , -0.56479 , -0.260535 , 0.559345 , 0.360701 ,
-0.179925 , 0.609995 , 0.801685 , -0.12088 , 1.1018 ,
0.113595 , -0.871335 , 0.008045 , 0.19502 , -0.88376 ,
0.995565 , 0.2234605, -0.1203995, -0.45073 , 0.03505 ,
-0.87461 , -0.81466 , -0.01929 , 0.48275 , -0.923825 ,
1.1388 , 1.145715 , 0.375925 , 0.1000245, 0.015275 ,
0.761995 , -0.1934265, 0.38571 , -0.425525 , -0.185705 ,
0.298155 , 0.665945 , 0.1537 , 0.508755 , 0.110155 ,
-0.35469 , -0.365415 , -0.245515 , -0.395905 , -0.26568 ]),
'construction_tools_construction': array([ 6.04326667e-01, -5.40133333e-02, 4.78586667e-01, 2.83800000e-01,
-4.04690333e-01, 3.73033333e-02, -6.85740000e-01, -8.43093333e-01,
-1.14010000e-01, -1.70072667e-01, -7.47566667e-03, -1.00790000e-01,
-1.81647667e-01, -2.78849667e-01, -3.55140000e-01, 2.03773333e-01,
1.87083333e-01, 5.66763333e-01, 3.55830000e-01, -6.04233333e-01,
1.33764667e+00, -5.36506667e-01, -1.31845000e+00, -1.02777667e+00,
1.58164000e-01, -5.60030000e-01, -8.85000000e-02, -4.28098667e-01,
4.28930000e-01, 8.98000000e-03, 3.32730000e+00, -9.14663333e-01,
-2.63333333e-04, -2.79684667e-01, 9.72200000e-02, 7.08906667e-01,
3.31869333e-01, 6.13723333e-01, 2.10327333e-01, 5.75602667e-01,
-2.77223333e-01, -8.02490000e-01, 3.61586667e-01, 2.26533333e-01,
4.81825333e-01, -3.60482333e-01, -7.28600000e-02, 1.81796667e-01,
2.17276667e-01, -2.75956667e-01]),
'pet_shop': array([ 0.383305 , -0.22558 , -0.1947675 , -0.02225955, 0.552615 ,
0.13605 , -1.42445 , -0.9175 , 0.82867 , 0.1530525 ,
-0.018925 , 0.265945 , 0.60104 , 0.45376 , 0.504435 ,
0.33318 , -0.437065 , 0.640445 , 0.09386 , -0.64959 ,
0.901288 , 0.2274495 , -0.5736005 , 0.627565 , -0.20507 ,
-1.05225 , -0.846905 , 0.1250828 , 0.569395 , -0.734295 ,
1.6631 , 0.17512 , -0.3570085 , 0.62313 , 0.11078 ,
0.520045 , 0.22995 , 0.216055 , 0.75598 , -0.06704 ,
0.154995 , 0.1381225 , -0.166715 , 0.671815 , 1.207155 ,
0.16389 , -0.39704 , -0.644335 , 0.89743 , 0.186325 ]),
'small appliances': array([ 3.4297000e-01, 3.9515000e-02, -8.8830000e-02, -4.2744500e-01,
3.0806500e-01, 6.3250000e-02, -6.7200000e-03, -1.9590000e-02,
-5.8000000e-05, -2.2235000e-02, -4.0970000e-02, -8.0975000e-02,
8.6283000e-01, 3.9184500e-01, -4.4992500e-01, -2.5332900e-01,
4.1254000e-01, 5.3558000e-01, 4.9550000e-03, -6.7982000e-01,
-1.7003000e-01, -3.7142050e-01, -4.6756000e-02, 1.7112500e-01,
-8.8070000e-02, -7.2185000e-02, -4.1208000e-01, 4.2770500e-01,
3.4677500e-01, 4.9295000e-02, 1.3307000e+00, -1.2948950e-01,
2.0801950e-01, 4.3545000e-02, -9.2545000e-02, 6.1857500e-01,
-1.7431850e-01, 8.0300000e-03, -1.1555500e-01, -6.6786000e-02,
1.0083015e-01, -1.8001000e-01, -1.2839150e-01, 4.0629500e-01,
5.5394500e-01, -3.2650805e-01, -1.0066500e-01, -2.3265000e-01,
1.1707450e-01, -2.2713140e-01]),
'agro_industry_and_trade': array([-3.6232000e-01, -1.6120825e-01, -3.0137925e-01, 2.7522175e-01,
3.0500000e-04, -1.9281200e-02, -4.3572750e-01, -5.8117250e-01,
1.4762350e-01, 1.8315050e-01, 2.2254500e-01, 3.1581500e-01,
-2.4672750e-02, 2.6693750e-01, 1.0409750e-01, -8.1837500e-03,
2.3921475e-01, 2.3425000e-02, 3.2642500e-01, -4.5122400e-01,
6.0444750e-01, -1.7562525e-01, -1.8230300e-01, -3.1608150e-01,
-1.7505000e-02, -7.9666750e-01, -2.9927500e-02, -6.0552500e-02,
2.2260000e-01, 1.8546000e-01, 2.4350500e+00, 1.0978275e-01,
2.2210275e-01, -6.2197750e-02, -4.0215250e-01, -5.1220000e-02,
-5.3526675e-01, 1.9079650e-01, -7.6040000e-02, 2.1851750e-01,
-4.5848575e-01, -3.6085000e-02, 3.8913000e-01, -5.1767000e-02,
4.7119500e-01, -4.3097500e-02, -4.2938000e-01, 2.7493000e-01,
1.1408500e-01, -5.4280700e-02]),
'in': array([ 0.33042 , 0.24995 , -0.60874 , 0.10923 , 0.036372 ,
0.151 , -0.55083 , -0.074239 , -0.092307 , -0.32821 ,
0.09598 , -0.82269 , -0.36717 , -0.67009 , 0.42909 ,
0.016496 , -0.23573 , 0.12864 , -1.0953 , 0.43334 ,
0.57067 , -0.1036 , 0.20422 , 0.078308 , -0.42795 ,
-1.7984 , -0.27865 , 0.11954 , -0.12689 , 0.031744 ,
3.8631 , -0.17786 , -0.082434 , -0.62698 , 0.26497 ,
-0.057185 , -0.073521 , 0.46103 , 0.30862 , 0.12498 ,
-0.48609 , -0.0080272, 0.031184 , -0.36576 , -0.42699 ,
0.42164 , -0.11666 , -0.50703 , -0.027273 , -0.53285 ]),
'living room furniture': array([ 1.67513333e-01, 4.02106667e-01, -7.22100000e-02, -7.26840000e-01,
7.17410000e-01, -2.41123333e-01, -2.67453333e-01, -2.62083333e-01,
-1.82842000e-01, 8.89833333e-02, -1.13746667e-01, -2.46113333e-01,
6.51142667e-01, 3.34633333e-01, 4.22633333e-01, 1.46620000e-01,
1.24803333e-01, 3.68426667e-01, 2.28036667e-01, -2.87353333e-01,
3.40700000e-02, 7.51323000e-01, -1.59634000e-01, 8.91543333e-02,
2.33490000e-01, -2.35480000e-01, -3.49280000e-01, 2.41256667e-01,
4.24886667e-01, 3.46933333e-02, 1.59026667e+00, 2.18807000e-01,
-4.28693333e-02, -3.34637667e-01, -3.76666667e-04, 6.62646667e-01,
2.05654333e-01, 3.62533333e-02, 3.48250000e-01, -2.08463000e-01,
-1.81054000e-01, -8.17216667e-02, 1.29733333e-01, 6.26040000e-01,
1.02564667e-01, -1.90523333e-01, -1.45210000e-01, -6.81410000e-01,
9.24056667e-02, -1.19250933e-01]),
'signaling_and_security': array([ 1.11156667e-01, 9.64066667e-04, 9.95133333e-02, -1.89446333e-01,
1.71883333e-01, -2.29106667e-01, 2.11993333e-01, -2.31163333e-01,
3.70213333e-02, -3.91980000e-01, -3.18066667e-02, 5.87833333e-02,
-1.59613333e-01, 1.03470000e-01, -1.95366667e-01, 2.85850000e-02,
1.07656333e-01, 1.13783333e-01, 3.79966667e-01, -2.66565333e-01,
1.08866667e-01, 3.58706333e-01, 3.67493333e-02, -2.50102000e-01,
-1.60673333e-01, -8.17556667e-01, 2.05733333e-02, 3.56850000e-02,
1.54873000e-01, 1.41806667e-01, 2.04663333e+00, 2.11610333e-01,
-2.49500333e-01, -5.60964333e-01, -7.81233333e-02, 6.12480000e-01,
1.91433333e-03, -1.82360000e-01, -3.06660000e-01, 2.23910333e-01,
-6.81943333e-02, 6.15733333e-02, 2.96260000e-01, -1.25100000e-02,
3.70623333e-01, -2.83243333e-01, -2.56300000e-01, 2.52823333e-01,
7.58333333e-02, -1.04514267e-01]),
'air conditioning': array([ 0.08439 , 0.046005 , 0.26098 , -0.27408 , -0.793375 ,
-0.803605 , -0.1245665, 0.010855 , 0.4847655, -0.353585 ,
0.91126 , 0.1183835, -0.063325 , 0.872935 , -0.636735 ,
0.54369 , -0.73477 , 0.976155 , -0.937524 , -1.147215 ,
0.059735 , 0.38916 , -0.273395 , -0.6447 , -0.405955 ,
-1.05922 , -0.005155 , 0.604635 , 1.055195 , 0.8969 ,
2.568 , 0.68065 , -0.350425 , -0.405875 , 0.188045 ,
0.6840155, 0.685145 , 0.197325 , 0.460865 , 0.616075 ,
0.3456 , 0.371473 , 0.03534 , 0.30622 , -0.018955 ,
0.251155 , 0.197905 , 0.02026 , -0.889225 , 0.85391 ]),
'consoles_games': array([-0.670455 , 0.318735 , -0.084189 , 0.50554 , -0.189155 ,
-0.334235 , -0.06092 , -0.120755 , -0.273838 , 0.441805 ,
0.054707 , 0.018565 , -0.313205 , 0.294875 , 0.529165 ,
-0.654975 , 0.08529 , 0.45546 , -0.714095 , -0.091431 ,
-0.54849 , -0.1767905 , 0.284039 , 0.01622 , 0.148317 ,
0.132665 , -0.079865 , -0.1243 , 0.354785 , -0.435235 ,
1.04355 , 0.8325155 , 0.0698745 , -0.1962215 , -0.136555 ,
0.71047 , 0.4870315 , 0.3173 , -0.665315 , -0.1829195 ,
-0.135135 , -0.66811 , -0.17432 , 0.580815 , 0.115815 ,
0.16498 , 0.0049695 , 0.1625 , 0.14231545, -0.0138379 ]),
'general_interest_books': array([-0.13378667, 0.40633 , -0.180478 , -0.32258 , 0.57698667,
-0.15125333, -0.81235333, -0.78724667, -0.17347333, -0.18801 ,
-0.10828667, 0.84404333, -0.32388033, -0.50571 , 0.66861067,
-0.10638333, -0.25498667, -0.40634 , 0.00961433, -0.27365133,
0.75540633, -0.20779667, 0.14485397, -0.28896333, -0.15458667,
-1.32330667, -0.607391 , -0.59106667, -0.26451667, 0.41909667,
2.96093333, -0.07119 , 0.06087667, -0.59353333, -0.0925 ,
-0.23531667, -0.59946833, 0.08378333, 0.00778733, -0.10507333,
0.04876 , 0.14108 , 0.00764667, 0.14884 , -0.19461133,
0.05654733, -0.11801333, 0.47503367, -0.10037333, 0.214126 ]),
'construction_tools_tools': array([ 0.39892333, -0.40918667, 0.56638333, 0.30366 , -0.16770067,
-0.04139333, -0.55339 , -1.12864667, 0.08304 , -0.06945533,
-0.03272433, 0.13751 , -0.05340533, -0.14626933, -0.45627 ,
0.19054667, 0.24414667, 0.44537667, 0.575 , -0.84766667,
1.04299333, -0.32336333, -0.8973 , -0.75705333, 0.328612 ,
-0.38793 , -0.39672 , -0.81054933, 0.48087 , -0.2697 ,
3.1354 , -0.85226667, -0.26795667, -0.48275233, 0.1343 ,
1.15210333, 0.13191867, 0.72079667, 0.15491467, 0.30319533,
0.21638333, -0.68201 , 0.23959333, 0.71031667, 0.92521267,
-0.19621467, 0.49402 , 0.42694333, 0.16449333, -0.26408333]),
'fashion_underwear_e_moda_praia': array([-0.4035864 , -0.074576 , -0.418086 , -0.3687174 , 0.05163 ,
-0.194718 , 0.3565326 , -0.331838 , 0.0718638 , 0.35747 ,
0.1141707 , -0.073368 , 0.101738 , 0.3884804 , 0.048456 ,
-0.15035 , -0.142352 , 0.329326 , 0.495374 , -0.508146 ,
-0.11091 , 0.2938714 , -0.0429112 , 0.08110864, -0.133962 ,
0.190236 , -0.884034 , 0.48503 , 0.476228 , 0.004268 ,
-0.240858 , 0.2754786 , -0.0439746 , -0.0122178 , -0.208024 ,
0.6074088 , -0.04877 , 0.298082 , -0.523692 , -0.1423354 ,
0.27784 , -0.189706 , -0.001552 , 0.313756 , 0.645318 ,
-0.640938 , 0.166912 , -0.243998 , 0.271598 , 0.07568232]),
'fashion_clothes_men': array([-0.20766067, 0.18115 , -0.58055 , -0.36946267, 0.71611 ,
-0.06227667, -0.480974 , -0.56409 , -0.18878433, -0.17501 ,
0.1055745 , -0.55299 , -0.601128 , -0.04001 , 0.75262667,
0.03583333, -0.38519333, 0.06401 , 0.00884333, -1.02360333,
0.10532333, 1.05513667, -0.07654333, 0.40878733, -0.4867 ,
-1.12438667, -1.28289333, 0.28985667, 0.52306667, -0.48607333,
2.29503333, 0.7971 , -0.01128033, -0.1677 , 0.21522667,
0.67615133, -0.14004467, 0.78054667, -0.49585867, -0.15393933,
0.16505333, 0.25155667, 0.61591 , 0.17914333, 0.84348333,
-0.51744 , -0.09429 , -0.87343 , 0.09008 , -0.10341667]),
'furniture_cozinha_area_de_servico_jantar_e_jardim': array([-0.12591 , 0.05804125, -0.3714275 , -0.45058238, -0.17667313,
-0.6250375 , 0.4318925 , 0.2268 , -0.097417 , 0.32719125,
0.15450375, 0.0841875 , 0.55648 , 0.065975 , -0.3158175 ,
-0.53394125, 0.372565 , 0.294245 , 0.2417575 , -0.24151 ,
-0.4950775 , -0.04474862, 0.01052183, 0.04897125, 0.0280925 ,
0.58375375, -0.39934212, 0.327625 , 0.249945 , 0.282065 ,
-0.1091875 , -0.13682175, -0.11248325, -0.11242225, -0.23426225,
0.45630125, 0.04041513, -0.06066375, -0.14173625, 0.15078825,
0.22956875, -0.37743613, 0.01378625, 0.17155463, 0.274235 ,
-0.62712375, -0.06439375, -0.1451625 , 0.444225 , 0.0523804 ]),
'industry_comercio_e_nocios': array([-0.4742575 , -0.2723675 , -0.17437425, -0.232995 , -0.17193 ,
-0.3223387 , 0.3587175 , -0.0484675 , 0.1350155 , 0.4752125 ,
0.1572025 , 0.245885 , 0.56979725, 0.47852 , -0.156155 ,
-0.3005925 , 0.3090825 , 0.508665 , 0.55089 , -0.430885 ,
0.0197325 , -0.15864075, -0.1362815 , -0.1735025 , 0.1958575 ,
0.3657475 , -0.5008075 , 0.1346775 , 0.21376025, 0.449445 ,
-0.014425 , 0.00752575, 0.19361925, 0.00729575, -0.4523525 ,
0.47392 , -0.19727275, 0.0138125 , -0.153285 , -0.0763395 ,
-0.05682 , -0.3689725 , -0.0889 , 0.383123 , 0.659645 ,
-0.4358525 , -0.1451525 , 0.265465 , 0.2715725 , 0.0709879 ]),
'fixed_telephony': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172]),
'construction_tools_lighting': array([ 0.30365733, 0.00852667, 0.36363 , 0.37041 , -0.29092033,
-0.04499667, -0.4140118 , -0.97149 , -0.06964 , 0.04465067,
0.05495333, 0.05082 , -0.008051 , 0.047657 , -0.418 ,
0.39159333, 0.00890667, 0.47169667, 0.20268333, -1.24886667,
1.15448 , -0.11887333, -1.19068333, -0.97411 , 0.31914867,
-0.21629 , -0.15260933, -0.24001267, 0.53057333, -0.04279667,
2.76066667, -0.49853 , -0.02294667, -0.74371233, 0.14017333,
0.99637 , 0.548296 , 0.54722667, 0.100202 , 0.48748933,
0.07836 , -0.18947333, 0.11410333, 0.17226 , 0.608706 ,
0.02270767, 0.18668 , -0.13145333, 0.158199 , -0.04822 ]),
'technical books': array([-2.20985000e-01, 4.93455000e-01, -3.43250000e-01, -1.47600000e-01,
-7.72650000e-02, -9.70300000e-02, -8.05691500e-01, -1.00974850e+00,
3.13295000e-01, -1.28325000e-01, -2.65610000e-01, 6.45660000e-01,
-4.74145000e-01, -3.70350000e-01, 5.73275000e-01, -5.05310000e-01,
-1.77599000e-01, 1.46670000e-01, 4.35085000e-02, -3.87222000e-01,
9.94990000e-01, 3.77935000e-01, 7.48250000e-02, 1.97150000e-02,
3.08385000e-01, -8.35975000e-01, -8.93880000e-01, -6.85165000e-01,
-4.81370000e-01, -3.36228500e-01, 2.91340000e+00, -2.76622000e-01,
1.77985000e-01, -7.18230000e-01, 1.30545000e-01, 6.03460000e-01,
-2.49035000e-01, 5.51365000e-01, -1.67550000e-02, -9.80000000e-04,
5.29630000e-01, -2.27780000e-01, 3.84375000e-01, 5.08770000e-01,
-3.15765000e-01, 3.01746115e-01, 5.93825000e-01, 9.88415500e-01,
-2.95853500e-01, -1.96015000e-01]),
'party_articles': array([-0.50433 , -0.16026 , 0.08696 , -0.22346 , 0.109395 ,
0.193285 , 0.289325 , 0.0988395 , -0.684108 , -0.096955 ,
-0.1763 , -0.306285 , 0.168245 , 0.644075 , -0.37435 ,
-0.38551 , 0.26593 , 0.116915 , 0.65119 , -0.21915 ,
-0.342685 , -0.0917405 , 0.251419 , 0.121205 , 0.12872515,
-0.447035 , -0.42251 , -0.14577 , -0.392985 , 0.62462 ,
1.04885 , 0.3406255 , -0.9024005 , 0.012877 , -0.79521 ,
0.09429 , -0.0706935 , -0.078135 , -0.42978 , -0.2156945 ,
-0.113035 , -0.219345 , -0.87891 , 0.20523 , 0.17079 ,
-0.3291111 , -0.913575 , -0.126415 , 0.65498 , -0.2975464 ]),
'drinks': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172]),
'market_place': array([ 1.5284535e-01, -8.0640000e-02, -6.7710000e-02, 2.7467000e-01,
6.2887500e-01, -6.9731500e-01, -6.2464500e-01, -2.7357500e-01,
-9.7784500e-02, 6.8520000e-02, 2.2050000e-03, -2.6884350e-01,
-5.6717500e-01, -1.5944500e-01, 3.6962000e-01, 7.0226000e-01,
1.6200000e-01, -2.4915500e-01, -5.9524500e-01, -4.6763500e-01,
8.5653000e-01, -4.3951000e-01, -3.1289300e-01, 3.6497100e-01,
-1.9862000e-01, -1.2867500e+00, -2.6297500e-01, 2.3820000e-01,
5.7564500e-01, 2.6461500e-01, 3.5481000e+00, 3.1361000e-01,
2.8795500e-01, -7.6710000e-02, 1.5154500e-02, -4.8535000e-01,
-3.5167500e-01, 5.5790500e-01, 1.2007150e-01, -4.4555500e-01,
-3.7528550e-01, -2.2686600e-01, 1.4548800e-01, -4.8349500e-02,
3.1328500e-01, 4.8606000e-01, 9.6989500e-02, -1.3875000e-01,
6.7078000e-01, 3.5310000e-02]),
'the kitchen': array([ 0.27307 , 0.246495 , -0.400945 , -0.135805 , 0.58728 ,
-0.1657535 , -0.57513 , -0.295725 , -0.04462861, -0.68539 ,
-0.274225 , 0.03947 , -0.123745 , 0.54466 , 0.22761025,
0.166529 , -0.14848 , 0.341135 , -0.01943 , -0.556735 ,
0.6389495 , 0.1451 , -0.31323 , 0.10856 , -0.060562 ,
-1.11357 , -0.6186 , 0.2788255 , 0.319625 , -0.21072 ,
3.0315 , -0.15982 , -0.341905 , 0.0733 , 0.01488257,
0.87522245, 0.19482 , 0.460465 , 0.7712205 , -0.076811 ,
0.08112 , 0.06233 , -0.23312 , 0.1889415 , 0.22492 ,
0.655975 , 0.09658745, -0.726805 , -0.007195 , -0.299525 ]),
'construction_tools_garden': array([ 0.18243 , -0.20012667, 0.26914333, -0.01552 , -0.22575367,
-0.14384333, -0.13404667, -0.57869 , -0.03662867, 0.086824 ,
0.01414667, 0.08944 , 0.18164567, 0.01512367, -0.37912667,
-0.03950333, 0.24689333, 0.49698333, 0.49488 , -0.56728 ,
0.63441667, -0.31123367, -0.714474 , -0.63042333, 0.24650867,
0.01005667, -0.29110333, -0.31333267, 0.40124333, 0.03250333,
1.7566 , -0.56499967, -0.07787367, -0.24509667, -0.05003333,
0.86399667, 0.17546033, 0.3963 , -0.00538933, 0.268763 ,
0.04907333, -0.67754 , 0.12735333, 0.48446333, 0.68456267,
-0.405439 , 0.17573667, 0.26160333, 0.22474333, -0.1786076 ]),
"fashion_women's clothes": array([-0.22060067, -0.01955333, -0.70562667, -0.516526 , 0.34408667,
-0.25464333, -0.19523333, -0.60999667, -0.09070633, 0.28833 ,
0.0828445 , -0.16442 , -0.07932467, 0.23266333, 0.36181 ,
0.01922333, -0.32360667, 0.11756333, 0.54914667, -0.87356 ,
0.17646667, 0.68122633, -0.318074 , 0.190164 , -0.26547 ,
-0.20931 , -1.21053 , 0.58811667, 0.68706 , -0.19505667,
1.00763333, 0.55689033, 0.12575933, -0.06539433, -0.06026 ,
0.823288 , -0.14054567, 0.6144 , -0.41369867, -0.147481 ,
0.31987 , 0.15638667, 0.19810333, 0.1517 , 0.71653333,
-0.50135 , 0.07344333, -0.50007667, 0.28248667, 0.15063907]),
'home_comfort': array([ 0.345755 , 0.687385 , -0.00641 , -0.318793 , 0.80272 ,
-0.322165 , -0.8930336 , 0.3639 , 0.1460923 , 0.082116 ,
-0.2519375 , -0.088095 , 0.220135 , -0.497 , 0.15241 ,
0.237783 , -0.38234 , -0.01023895, -0.01225 , -0.41226 ,
-0.015355 , 0.77824 , -0.715435 , 0.12335 , 0.179905 ,
-1.17656 , -0.166625 , -0.01410195, 0.79957 , -0.125265 ,
2.59845 , 0.971865 , 0.113664 , -0.0476665 , 0.528735 ,
0.331815 , 0.051644 , 0.065665 , 0.2696835 , -0.50006 ,
0.09236 , 0.1312875 , 0.0487 , 0.22047 , 0.186458 ,
0.40967 , -0.269445 , -0.51695 , 0.22382 , 0.1359 ]),
'audio': array([-2.7164e-01, -1.0620e-01, 1.1639e+00, 8.3055e-01, -6.4055e-01,
-3.8543e-01, -3.8031e-01, -1.0309e+00, 7.1602e-01, 1.5211e+00,
3.7758e-01, 3.3499e-01, 2.7958e-01, 5.2931e-01, -1.3379e-01,
4.2163e-02, -1.1169e+00, 4.0426e-01, 2.4580e-01, -4.5887e-01,
1.1611e+00, 6.7204e-01, 7.6799e-01, 4.1851e-01, 2.4399e-01,
5.2901e-04, -5.9963e-01, -2.3333e-01, -1.0900e-01, -3.2255e-01,
2.7442e+00, -9.7334e-01, 1.9286e-01, -7.1449e-01, 7.5143e-01,
1.0963e+00, 6.5071e-01, -4.2206e-01, -5.9653e-01, -4.3164e-01,
1.7057e+00, 6.6735e-01, -1.0194e+00, -5.6485e-02, -3.5755e-01,
3.1120e-01, 1.0823e+00, 5.9796e-01, -7.2269e-01, -2.1982e-02]),
'food_drinks': array([ 0.00813 , -0.291315 , -0.377935 , -0.4511 , 0.1697 ,
-0.33926 , -0.077915 , -0.333155 , 0.896992 , 0.2695405,
0.11274 , 0.153239 , 0.917145 , 0.075095 , 0.373115 ,
-0.0348 , 0.1596992, 0.318175 , 0.5339 , -0.51331 ,
0.223305 , -0.0314365, 0.330239 , 0.06217 , -0.04503 ,
0.046845 , -0.351505 , 0.21263 , 0.719215 , 0.48299 ,
1.12075 , 0.3537705, -0.0518605, 0.1537985, -0.271715 ,
0.753195 , -0.2181435, 0.15729 , 0.26829 , 0.1087805,
0.13845 , 0.02583 , 0.160275 , 0.227394 , 0.972075 ,
0.131805 , -0.3517 , 0.262695 , 0.3159 , 0.1296036]),
'musica': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172]),
'foods': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172]),
'tablets_impression_image': array([-0.077384 , 0.13992667, -0.08127333, -0.24457667, 0.52161 ,
0.0268 , 0.3012193 , -0.13138667, -0.154263 , 0.57826667,
0.146602 , 0.145905 , 0.19682467, 0.18164667, -0.227789 ,
0.03761 , -0.04343 , -0.05507267, 0.34711833, -0.46857667,
-0.36486 , 0.25493967, -0.13454067, -0.28168667, 0.48092667,
-0.47859 , -0.93668333, 0.50939 , 0.46803667, 0.03142333,
1.06486667, 0.17615833, 0.07237967, -0.821551 , -0.53859667,
0.411347 , -0.13504233, 0.09343 , -0.61583667, -0.72796967,
0.31461667, -0.12306167, -0.34256333, 0.02995 , 0.17674467,
-0.19554367, 0.07130667, 0.06865333, 0.40138667, -0.1278366 ]),
'imported_books': array([-0.244705 , 0.335795 , -0.222065 , -0.78122 , 0.08473 ,
-0.125975 , -0.356555 , -0.847265 , 0.175237 , 0.32992 ,
-0.42183 , 0.66945 , 0.312405 , 0.037295 , 0.552515 ,
-0.62933 , -0.01369 , 0.133375 , 0.353835 , -0.136222 ,
0.324805 , 0.1226795, 0.455654 , 0.19045 , 0.274675 ,
0.079355 , -0.969995 , -0.238115 , 0.005605 , -0.1791 ,
0.73675 , -0.2492695, 0.0702045, -0.0370065, -0.43412 ,
0.6826 , -0.2712735, 0.02613 , -0.31941 , -0.1399095,
0.58172 , -0.325435 , 0.09677 , 0.62292 , 0.174805 ,
-0.02849 , 0.20121 , 0.0775005, -0.17929 , -0.2912714]),
'portateis_house_furnace_and_cafe': array([-0.099666 , 0.003426 , -0.2046856 , -0.4632126 , 0.109458 ,
-0.064172 , 0.204808 , 0.0623822 , -0.2286756 , 0.034732 ,
-0.266516 , 0.151062 , 0.3890798 , 0.2014771 , -0.2018558 ,
-0.218973 , 0.1768202 , 0.264802 , 0.356504 , -0.397182 ,
0.027362 , 0.1166374 , 0.0579048 , -0.1238992 , 0.052086 ,
-0.045282 , -0.3376 , 0.191854 , 0.0964178 , 0.1259412 ,
0.60202 , -0.0926374 , -0.0940206 , -0.1818258 , -0.106402 ,
0.4904742 , 0.0271738 , -0.035798 , -0.125744 , -0.1147594 ,
0.0332394 , -0.208786 , -0.180152 , 0.290114 , 0.434124 ,
-0.255654 , -0.171102 , -0.226338 , 0.1334044 , -0.03362768]),
'fashion_sport': array([-0.468326 , 0.404715 , -1.04473 , 0.302491 , -0.172345 ,
-0.143529 , -0.655065 , -0.984575 , 0.0084035 , 0.255903 ,
0.54345175, -0.341805 , -1.01145 , 0.18418 , 0.670005 ,
-0.139555 , -0.04731 , 0.870955 , -0.16649 , -1.08362 ,
-0.11597 , 0.521785 , -0.78153 , 0.90431 , -0.19628 ,
-1.032805 , -1.387205 , 0.41287 , 0.14743115, 0.10575 ,
2.22825 , 0.85217 , -0.076315 , -0.38412 , -0.229925 ,
0.122702 , -0.253095 , 0.754225 , -0.75183 , -0.40324 ,
0.269256 , -0.121175 , 0.253995 , 0.092845 , 0.2290005 ,
-0.1804895 , -0.055165 , -0.118525 , 0.526355 , 0.28073 ]),
'Christmas articles': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172]),
'fashion_clothing_child_juvenile': array([-0.1697005 , 0.0846225 , -0.7907775 , -0.4484795 , 0.4509025 ,
0.621975 , -0.3533375 , -0.8179925 , 0.67582675, 0.147415 ,
0.01591088, -0.243015 , 0.0388425 , -0.08315852, 0.804845 ,
-0.151865 , -0.4756445 , -0.043465 , 0.37923 , -0.3736475 ,
0.16074 , 0.524215 , -0.3643075 , 0.427535 , -0.64324 ,
-1.110075 , -0.7682625 , -0.10014 , 0.3830925 , -0.25006 ,
2.06795 , 0.1736875 , 0.1705775 , -0.4934725 , 0.209775 ,
0.558596 , 0.02229 , 0.23991 , -0.010885 , -0.6099635 ,
-0.264945 , 0.144368 , 0.53343 , 0.44280175, 0.7556875 ,
-0.43805 , 0.10335 , -0.811935 , 0.24922 , 0.41137 ]),
'dvds_blu_ray': array([-0.39314667, 0.23061333, 0.14749667, -0.25329667, -0.13175667,
-0.36274333, 0.33394 , -0.02583333, 0.12817933, 0.54910333,
0.20746667, 0.5596 , 0.29848 , 0.42703 , -0.17808667,
-0.16627 , 0.14480667, 0.17625333, 0.11319333, -0.39455 ,
-0.57405667, 0.128786 , 0.33444533, -0.27324333, 0.05667667,
0.34940333, -0.34053667, 0.22059967, 0.168559 , 0.14592 ,
-0.39446667, -0.05937933, -0.00499733, -0.33766867, 0.10468667,
0.56092333, 0.37097533, -0.04510667, -0.03376667, 0.00296733,
0.19969667, 0.20785333, -0.44646 , 0.24233333, 0.47532667,
-0.39343 , 0.08812667, -0.32198667, 0.0627 , 0.34477813]),
'arts_and_crafts': array([-0.49332 , 0.52681667, -1.10482333, -0.02062333, 0.0057 ,
-0.18179667, -0.61973333, -0.95736 , -0.26367767, 0.16554 ,
0.11252254, -0.06041 , 0.29226 , -0.16823067, -0.35258 ,
-0.24899167, 0.298813 , 0.61835 , -0.240977 , -0.36598 ,
1.23992667, 0.55204333, -0.40991667, 0.066313 , -0.20028933,
-0.56011667, -0.65885 , -0.65188667, -0.29332033, -0.57256 ,
2.65833333, 0.16821933, -0.04925 , -0.691857 , 0.02931667,
0.62099333, -0.3367 , 0.52606333, 0.11731667, 0.47442423,
0.141009 , -0.03214667, 0.213229 , -0.115837 , 0.27594 ,
0.18471467, -0.09605433, -0.61017 , -0.23260733, -0.0574 ]),
'pc_gamer': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172]),
'furniture_bedroom': array([ 0.30487 , 0.294645 , 0.00895 , -0.69152 , 0.473915 ,
-0.14226 , 0.08369 , 0.017115 , -0.307513 , 0.219474 ,
-0.54438 , -0.22494 , 0.465365 , 0.490195 , 0.163165 ,
0.01455 , -0.174765 , 0.368275 , 0.583415 , -0.33221 ,
0.150465 , 0.7285345, -0.538586 , 0.173255 , -0.15098 ,
0.341725 , -0.33548 , 0.592765 , 0.496655 , -0.037535 ,
0.1988 , -0.1238095, 0.1784445, 0.1923785, 0.213045 ,
0.92709 , 0.2747815, -0.007905 , 0.326995 , -0.4153545,
0.169175 , -0.10017 , -0.0981945, 0.649535 , 0.15375 ,
-0.448295 , 0.189645 , -1.025065 , 0.174305 , -0.0934664]),
'cinema_photo': array([-0.2630185, 0.69991 , 0.125545 , 0.25244 , 0.194055 ,
-0.69504 , -0.559845 , -1.09955 , 0.111025 , 0.549665 ,
-0.102665 , -0.39955 , -0.06765 , 0.816255 , 0.51171 ,
-0.182385 , -0.511395 , 0.22274 , -0.357578 , -0.08566 ,
1.154225 , 0.72874 , -0.4275679, 0.6007 , -0.038558 ,
-0.669535 , -1.46215 , -0.00609 , -0.24865 , -0.269616 ,
1.77855 , -0.170537 , -0.009405 , -0.742215 , -0.402225 ,
0.568995 , 0.27262 , 0.0691885, -0.61864 , -0.433645 ,
0.962525 , 0.431915 , -0.03867 , -0.49001 , -0.074865 ,
0.02116 , 0.27405 , -0.57604 , 0.28907 , 0.0622715]),
'diapers_hygiene': array([-0.59861 , -0.123945 , 0.109115 , -0.644945 , -0.135375 ,
0.41613 , 0.167965 , -0.681365 , 0.395462 , 0.0864 ,
0.446075 , 0.22311 , 0.63748 , 0.05853 , 0.1684 ,
0.429185 , -0.701975 , 0.637835 , 0.57862 , -0.76477 ,
-0.038235 , 0.3163095 , 0.560914 , -0.374225 , -0.022415 ,
0.417105 , -0.65033 , 0.56355 , 0.934065 , 0.1461375 ,
-0.646965 , 0.9586655 , -0.4055955 , 0.1694585 , -0.1692205 ,
0.70821 , 0.02711195, 0.225575 , 0.1838 , -0.0665475 ,
0.00714 , -0.3096115 , -0.0689485 , 0.61335 , 0.56992 ,
-0.3076395 , -0.007655 , -0.22928 , 0.362775 , -0.0895764 ]),
'flores': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172]),
'kitchen_and_food_preparadores_portables': array([-0.008676 , -0.066608 , -0.33233 , -0.3826546 , 0.235532 ,
-0.138756 , -0.097108 , -0.26343 , 0.2591442 , 0.0373122 ,
-0.049712 , 0.2148136 , 0.494688 , 0.230144 , 0.157134 ,
-0.044589 , 0.05191948, 0.405222 , 0.435294 , -0.55728 ,
0.3219 , 0.1871012 , 0.1498912 , 0.0770928 , 0.0264332 ,
-0.130144 , -0.3339 , 0.1901 , 0.5611398 , 0.12659 ,
1.39016 , 0.1918784 , -0.0187264 , 0.0551428 , -0.1131354 ,
0.863488 , -0.1035688 , 0.314634 , 0.361196 , 0.0793706 ,
0.1753734 , 0.030908 , 0.04689 , 0.2299356 , 0.747392 ,
0.172958 , -0.143748 , -0.240028 , 0.154946 , 0.01384288]),
'insurance_and_services': array([ 3.55600000e-01, 1.31851667e-01, 1.30000000e-01, 8.22273333e-02,
-1.94096667e-01, 5.71265333e-01, -8.71626667e-01, -3.83083333e-01,
5.28243333e-01, -5.23333333e-01, 2.66864667e-01, 4.18070000e-01,
-7.99466667e-03, -3.73093333e-01, -2.11763333e-01, 5.95653000e-02,
-4.29207000e-01, 4.53833333e-02, 3.11830000e-01, -3.08379000e-01,
9.91393333e-01, 5.60000000e-03, -5.25376667e-01, -1.30253333e-02,
-5.31946667e-01, -1.31250000e+00, 7.14000000e-03, -7.02380000e-01,
-4.83710000e-02, -2.24033333e-03, 3.44356667e+00, 5.46543333e-01,
4.10760000e-01, -2.78560000e-01, 2.34540000e-02, -4.12333333e-03,
-1.32902667e-01, -7.62000000e-02, 6.74253333e-01, -2.20366667e-02,
5.31423333e-02, -1.58070000e-01, 4.86379667e-01, 4.46453333e-01,
-1.05496667e-01, -2.26063333e-01, -5.88506667e-01, 2.20563333e-01,
2.69366667e-01, 5.35540000e-01]),
'furniture_colchao_and_upholstery': array([-0.274925 , -0.06702 , -0.2478475 , -0.47145075, -0.0501925 ,
-0.1457725 , 0.4994125 , 0.0583325 , -0.1418745 , 0.340415 ,
0.12953 , 0.251045 , 0.4724625 , 0.2496925 , -0.2693075 ,
-0.37510125, 0.23952475, 0.3842975 , 0.3527125 , -0.317 ,
-0.2716475 , 0.05693175, 0.1762235 , -0.1000465 , 0.16426 ,
0.3891725 , -0.3182325 , 0.165985 , 0.22365725, 0.1524025 ,
0.0669 , 0.13235325, 0.06005675, -0.11024975, -0.203665 ,
0.6325025 , -0.04205775, -0.0281375 , -0.255955 , 0.03341825,
0.13878425, -0.3188775 , -0.10142 , 0.3261125 , 0.54844 ,
-0.46054 , 0.0015925 , -0.0259725 , 0.156775 , -0.0920796 ]),
'cds_dvds_musicais': array([-0.45596 , -0.13718 , -0.23754 , -0.63402 , -0.10487 ,
-0.42744 , 0.83699 , 0.23567 , -0.078916 , 0.5 ,
0.08264 , 0.2316 , 0.77999 , 0.47049 , -0.32597 ,
-0.51283 , 0.30945 , 0.47881 , 0.55381 , -0.24994 ,
-0.47739 , -0.073831 , 0.072328 , -0.10644 , 0.25275 ,
0.97813 , -0.38809 , 0.29865 , 0.29393 , 0.35823 ,
-1.1929 , 0.071931 , 0.034599 , 0.027147 , -0.38162 ,
0.73098 , 0.062593 , -0.14562 , -0.38141 , -0.072509 ,
0.20806 , -0.54812 , -0.21912 , 0.51654 , 0.64665 ,
-0.65962 , 0.10605 , 0.17607 , 0.29246 , 0.0042172])}
2.3 | Cosine Similarity
- The cosine similarity of any two product categories is then determined, given by the equation: $ \frac{\vec{a} \cdot \vec{b}}{\|\vec{a}\| \, \|\vec{b}\|} $ where $\vec{a}$ and $\vec{b}$ are $N$-dimensional vectors
- The values lie in $[-1, 1]$ and are interpreted similarly to a correlation coefficient
- Also returned is a dictionary, `embedding_dict`, mapping each numerical row/column index of the similarity matrix to its categorical name
# Number of product categories that have an embedding vector.
len_categories = len(embeddings)
# Convenience accessor: category name -> its embedding vector.
get_embeddings_lambda = lambda x: embeddings[x]
# Not populated by this cell; retained so the name stays defined.
emb_dict = {}

# Embedding vectors in dict insertion order: position i here is row/column i
# of `similarities`.
embedding_vectors = list(embeddings.values())
# Each vector's norm is needed for a whole row and a whole column of the
# matrix, so compute it once per category rather than once per pair.
embedding_norms = [norm(vector) for vector in embedding_vectors]

# similarities[i, j] = (a_i . a_j) / (|a_i| * |a_j|)
similarities = np.zeros((len_categories, len_categories))
for i, emb1 in enumerate(embedding_vectors):
    for j, emb2 in enumerate(embedding_vectors):
        similarities[i, j] = np.dot(emb1, emb2) / (embedding_norms[i] * embedding_norms[j])

# Positional index -> category name, in the same order as the rows and
# columns of `similarities`.
embedding_dict = dict(enumerate(embeddings))
embedding_dict
{0: 'perfumery',
1: 'artes',
2: 'sport_leisure',
3: 'babies',
4: 'domestic_utilities',
5: 'musical instruments',
6: 'cool_stuff',
7: 'furniture_decoration',
8: 'home appliances',
9: 'toys',
10: 'bed table bath',
11: 'construction_tools_security',
12: 'informatica_accessories',
13: 'beauty_health',
14: 'bags_accessories',
15: 'garden_tools',
16: 'furniture_office',
17: 'automotive',
18: 'electronic',
19: 'fashion_shoes',
20: 'telephony',
21: 'stationary store',
22: 'fashion_bolsas_e_acessorios',
23: 'pcs',
24: 'house_construction',
25: 'watches_gifts',
26: 'construction_tools_construction',
27: 'pet_shop',
28: 'small appliances',
29: 'agro_industry_and_trade',
30: 'in',
31: 'living room furniture',
32: 'signaling_and_security',
33: 'air conditioning',
34: 'consoles_games',
35: 'general_interest_books',
36: 'construction_tools_tools',
37: 'fashion_underwear_e_moda_praia',
38: 'fashion_clothes_men',
39: 'furniture_cozinha_area_de_servico_jantar_e_jardim',
40: 'industry_comercio_e_nocios',
41: 'fixed_telephony',
42: 'construction_tools_lighting',
43: 'technical books',
44: 'party_articles',
45: 'drinks',
46: 'market_place',
47: 'the kitchen',
48: 'construction_tools_garden',
49: "fashion_women's clothes",
50: 'home_comfort',
51: 'audio',
52: 'food_drinks',
53: 'musica',
54: 'foods',
55: 'tablets_impression_image',
56: 'imported_books',
57: 'portateis_house_furnace_and_cafe',
58: 'fashion_sport',
59: 'Christmas articles',
60: 'fashion_clothing_child_juvenile',
61: 'dvds_blu_ray',
62: 'arts_and_crafts',
63: 'pc_gamer',
64: 'furniture_bedroom',
65: 'cinema_photo',
66: 'diapers_hygiene',
67: 'flores',
68: 'kitchen_and_food_preparadores_portables',
69: 'insurance_and_services',
70: 'furniture_colchao_and_upholstery',
71: 'cds_dvds_musicais'}
2.4 | Similarity Matrix & Heatmap
- A dataframe is created with similarity values for every product category
- Also made is a heatmap featuring all product categories, on a cool-warm scale from blue (lowest similarity) to red (highest similarity)
- Finally, the function `get_similarity` is created to obtain the similarity scores for any entered product category
# Wrap the raw similarity matrix in a DataFrame and relabel both axes through
# `embedding_dict`, so rows and columns are addressed by category name.
similarities_df = pd.DataFrame(similarities).rename(
    index=embedding_dict,
    columns=embedding_dict,
)
similarities_df
perfumery | artes | sport_leisure | babies | domestic_utilities | musical instruments | cool_stuff | furniture_decoration | home appliances | toys | ... | arts_and_crafts | pc_gamer | furniture_bedroom | cinema_photo | diapers_hygiene | flores | kitchen_and_food_preparadores_portables | insurance_and_services | furniture_colchao_and_upholstery | cds_dvds_musicais | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
perfumery | 1.000000 | 1.000000 | 0.407878 | -0.118951 | 0.257968 | -0.177004 | -0.087149 | 1.000000 | 0.033297 | -0.198021 | ... | -0.314922 | 1.000000 | 0.358496 | -0.242668 | 0.526981 | 1.000000 | 0.100799 | -0.543074 | 0.864288 | 1.000000 |
artes | 1.000000 | 1.000000 | 0.407878 | -0.118951 | 0.257968 | -0.177004 | -0.087149 | 1.000000 | 0.033297 | -0.198021 | ... | -0.314922 | 1.000000 | 0.358496 | -0.242668 | 0.526981 | 1.000000 | 0.100799 | -0.543074 | 0.864288 | 1.000000 |
sport_leisure | 0.407878 | 0.407878 | 1.000000 | 0.122967 | 0.551266 | 0.179462 | 0.375641 | 0.407878 | 0.403078 | 0.212963 | ... | 0.295148 | 0.407878 | 0.196918 | 0.275566 | 0.428169 | 0.407878 | 0.419741 | 0.122980 | 0.587471 | 0.407878 |
babies | -0.118951 | -0.118951 | 0.122967 | 1.000000 | 0.277467 | 0.229472 | 0.357714 | -0.118951 | 0.371962 | 0.569182 | ... | 0.221768 | -0.118951 | 0.330407 | 0.146192 | 0.436191 | -0.118951 | 0.489054 | 0.362452 | 0.094993 | -0.118951 |
domestic_utilities | 0.257968 | 0.257968 | 0.551266 | 0.277467 | 1.000000 | 0.268597 | 0.335793 | 0.257968 | 0.376892 | 0.332161 | ... | 0.308662 | 0.257968 | 0.104937 | 0.207909 | 0.324601 | 0.257968 | 0.579326 | 0.447626 | 0.506631 | 0.257968 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
flores | 1.000000 | 1.000000 | 0.407878 | -0.118951 | 0.257968 | -0.177004 | -0.087149 | 1.000000 | 0.033297 | -0.198021 | ... | -0.314922 | 1.000000 | 0.358496 | -0.242668 | 0.526981 | 1.000000 | 0.100799 | -0.543074 | 0.864288 | 1.000000 |
kitchen_and_food_preparadores_portables | 0.100799 | 0.100799 | 0.419741 | 0.489054 | 0.579326 | 0.462568 | 0.719475 | 0.100799 | 0.669633 | 0.576797 | ... | 0.624020 | 0.100799 | 0.500249 | 0.424094 | 0.423335 | 0.100799 | 1.000000 | 0.519644 | 0.476721 | 0.100799 |
insurance_and_services | -0.543074 | -0.543074 | 0.122980 | 0.362452 | 0.447626 | 0.491411 | 0.366427 | -0.543074 | 0.429522 | 0.405047 | ... | 0.663339 | -0.543074 | -0.004517 | 0.402155 | -0.113448 | -0.543074 | 0.519644 | 1.000000 | -0.170836 | -0.543074 |
furniture_colchao_and_upholstery | 0.864288 | 0.864288 | 0.587471 | 0.094993 | 0.506631 | 0.181650 | 0.236841 | 0.864288 | 0.326945 | 0.020209 | ... | 0.094675 | 0.864288 | 0.446304 | -0.008462 | 0.552552 | 0.864288 | 0.476721 | -0.170836 | 1.000000 | 0.864288 |
cds_dvds_musicais | 1.000000 | 1.000000 | 0.407878 | -0.118951 | 0.257968 | -0.177004 | -0.087149 | 1.000000 | 0.033297 | -0.198021 | ... | -0.314922 | 1.000000 | 0.358496 | -0.242668 | 0.526981 | 1.000000 | 0.100799 | -0.543074 | 0.864288 | 1.000000 |
72 rows × 72 columns
# Plot the cosine similarities themselves. `similarities_df` already holds
# values in [-1, 1], matching the vmin/vmax of the colour scale; taking a
# further Pearson correlation of its columns would chart a second-order
# statistic rather than the similarity this section describes.
sns.heatmap(similarities_df.round(2), cmap='coolwarm', annot=False, vmin=-1, vmax=1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f1d5161bb10>
def get_similarity(x):
    """Return the similarity scores stored under column ``x``.

    Looks up ``x`` in the module-level ``similarities_df`` at call time.

    Args:
        x: Column label of ``similarities_df`` (a product category name).

    Returns:
        The result of ``similarities_df[x]``; for a single column label this
        is a pandas Series indexed by the row labels of ``similarities_df``.

    Raises:
        KeyError: Propagated from pandas when ``x`` is not a column label.
    """
    return similarities_df[x]
3 | Entity Relationship Diagrams & Preview
3.1 | Previewing Datasets
- According to the below Entity Relationship Diagram, we preview tables utilized for this analysis
3.2 | olist_orders_dataset
# Load the orders table from CSV; the bare name below renders the frame as the
# notebook cell's output.
orders = pd.read_csv("datasets/olist_orders_dataset.csv")
orders
order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
---|---|---|---|---|---|---|---|---|
0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 00:00:00 |
1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 00:00:00 |
2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 00:00:00 |
3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 00:00:00 |
4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 00:00:00 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
99436 | 9c5dedf39a927c1b2549525ed64a053c | 39bd1228ee8140590ac3aca26f2dfe00 | delivered | 2017-03-09 09:54:05 | 2017-03-09 09:54:05 | 2017-03-10 11:18:03 | 2017-03-17 15:08:01 | 2017-03-28 00:00:00 |
99437 | 63943bddc261676b46f01ca7ac2f7bd8 | 1fca14ff2861355f6e5f14306ff977a7 | delivered | 2018-02-06 12:58:58 | 2018-02-06 13:10:37 | 2018-02-07 23:22:42 | 2018-02-28 17:37:56 | 2018-03-02 00:00:00 |
99438 | 83c1379a015df1e13d02aae0204711ab | 1aa71eb042121263aafbe80c1b562c9c | delivered | 2017-08-27 14:46:43 | 2017-08-27 15:04:16 | 2017-08-28 20:52:26 | 2017-09-21 11:24:17 | 2017-09-27 00:00:00 |
99439 | 11c177c8e97725db2631073c19f07b62 | b331b74b18dc79bcdf6532d51e1637c1 | delivered | 2018-01-08 21:28:27 | 2018-01-08 21:36:21 | 2018-01-12 15:35:03 | 2018-01-25 23:32:54 | 2018-02-15 00:00:00 |
99440 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-09 22:11:59 | 2018-03-16 13:08:30 | 2018-04-03 00:00:00 |
99441 rows × 8 columns
3.3 | olist_order_customer_dataset
# Load the customers table from CSV (note the file is olist_customers_dataset.csv)
# and render it as the cell output.
orderCustomers = pd.read_csv("datasets/olist_customers_dataset.csv")
orderCustomers
customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | |
---|---|---|---|---|---|
0 | 06b8999e2fba1a1fbc88172c00ba8bc7 | 861eff4711a542e4b93843c6dd7febb0 | 14409 | franca | SP |
1 | 18955e83d337fd6b2def6b18a428ac77 | 290c77bc529b7ac935b93aa66c333dc3 | 9790 | sao bernardo do campo | SP |
2 | 4e7b3e00288586ebd08712fdd0374a03 | 060e732b5b29e8181a18229c7b0b2b5e | 1151 | sao paulo | SP |
3 | b2b6027bc5c5109e529d4dc6358b12c3 | 259dac757896d24d7702b9acbbff3f3c | 8775 | mogi das cruzes | SP |
4 | 4f2d8ab171c80ec8364f7c12e35b23ad | 345ecd01c38d18a9036ed96c73b8d066 | 13056 | campinas | SP |
... | ... | ... | ... | ... | ... |
99436 | 17ddf5dd5d51696bb3d7c6291687be6f | 1a29b476fee25c95fbafc67c5ac95cf8 | 3937 | sao paulo | SP |
99437 | e7b71a9017aa05c9a7fd292d714858e8 | d52a67c98be1cf6a5c84435bd38d095d | 6764 | taboao da serra | SP |
99438 | 5e28dfe12db7fb50a4b2f691faecea5e | e9f50caf99f032f0bf3c55141f019d99 | 60115 | fortaleza | CE |
99439 | 56b18e2166679b8a959d72dd06da27f9 | 73c2643a0a458b49f58cea58833b192e | 92120 | canoas | RS |
99440 | 274fa6071e5e17fe303b9748641082c8 | 84732c5050c01db9b23e19ba39899398 | 6703 | cotia | SP |
99441 rows × 5 columns
3.4 | olist_order_reviews_dataset
# Load the order reviews table from CSV and render it as the cell output.
orderReviews = pd.read_csv("datasets/olist_order_reviews_dataset.csv")
orderReviews
review_id | order_id | review_score | review_comment_title | review_comment_message | review_creation_date | review_answer_timestamp | |
---|---|---|---|---|---|---|---|
0 | 7bc2406110b926393aa56f80a40eba40 | 73fc7af87114b39712e6da79b0a377eb | 4 | NaN | NaN | 2018-01-18 00:00:00 | 2018-01-18 21:46:59 |
1 | 80e641a11e56f04c1ad469d5645fdfde | a548910a1c6147796b98fdf73dbeba33 | 5 | NaN | NaN | 2018-03-10 00:00:00 | 2018-03-11 03:05:13 |
2 | 228ce5500dc1d8e020d8d1322874b6f0 | f9e4b658b201a9f2ecdecbb34bed034b | 5 | NaN | NaN | 2018-02-17 00:00:00 | 2018-02-18 14:36:24 |
3 | e64fb393e7b32834bb789ff8bb30750e | 658677c97b385a9be170737859d3511b | 5 | NaN | Recebi bem antes do prazo estipulado. | 2017-04-21 00:00:00 | 2017-04-21 22:02:06 |
4 | f7c4243c7fe1938f181bec41a392bdeb | 8e6bfb81e283fa7e4f11123a3fb894f1 | 5 | NaN | Parabéns lojas lannister adorei comprar pela I... | 2018-03-01 00:00:00 | 2018-03-02 10:26:53 |
... | ... | ... | ... | ... | ... | ... | ... |
99219 | 574ed12dd733e5fa530cfd4bbf39d7c9 | 2a8c23fee101d4d5662fa670396eb8da | 5 | NaN | NaN | 2018-07-07 00:00:00 | 2018-07-14 17:18:30 |
99220 | f3897127253a9592a73be9bdfdf4ed7a | 22ec9f0669f784db00fa86d035cf8602 | 5 | NaN | NaN | 2017-12-09 00:00:00 | 2017-12-11 20:06:42 |
99221 | b3de70c89b1510c4cd3d0649fd302472 | 55d4004744368f5571d1f590031933e4 | 5 | NaN | Excelente mochila, entrega super rápida. Super... | 2018-03-22 00:00:00 | 2018-03-23 09:10:43 |
99222 | 1adeb9d84d72fe4e337617733eb85149 | 7725825d039fc1f0ceb7635e3f7d9206 | 4 | NaN | NaN | 2018-07-01 00:00:00 | 2018-07-02 12:59:13 |
99223 | efe49f1d6f951dd88b51e6ccd4cc548f | 90531360ecb1eec2a1fbb265a0db0508 | 1 | NaN | meu produto chegou e ja tenho que devolver, po... | 2017-07-03 00:00:00 | 2017-07-03 21:01:49 |
99224 rows × 7 columns
3.5 | olist_order_payments_dataset
# Load the order payments table from CSV and render it as the cell output.
orderPayments = pd.read_csv("datasets/olist_order_payments_dataset.csv")
orderPayments
order_id | payment_sequential | payment_type | payment_installments | payment_value | |
---|---|---|---|---|---|
0 | b81ef226f3fe1789b1e8b2acac839d17 | 1 | credit_card | 8 | 99.33 |
1 | a9810da82917af2d9aefd1278f1dcfa0 | 1 | credit_card | 1 | 24.39 |
2 | 25e8ea4e93396b6fa0d3dd708e76c1bd | 1 | credit_card | 1 | 65.71 |
3 | ba78997921bbcdc1373bb41e913ab953 | 1 | credit_card | 8 | 107.78 |
4 | 42fdf880ba16b47b59251dd489d4441a | 1 | credit_card | 2 | 128.45 |
... | ... | ... | ... | ... | ... |
103881 | 0406037ad97740d563a178ecc7a2075c | 1 | boleto | 1 | 363.31 |
103882 | 7b905861d7c825891d6347454ea7863f | 1 | credit_card | 2 | 96.80 |
103883 | 32609bbb3dd69b3c066a6860554a77bf | 1 | credit_card | 1 | 47.77 |
103884 | b8b61059626efa996a60be9bb9320e10 | 1 | credit_card | 5 | 369.54 |
103885 | 28bbae6599b09d39ca406b747b6632b1 | 1 | boleto | 1 | 191.58 |
103886 rows × 5 columns
3.6 | olist_order_items_dataset
# Load the order items table from CSV and render it as the cell output.
orderItems = pd.read_csv("datasets/olist_order_items_dataset.csv")
orderItems
order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
---|---|---|---|---|---|---|---|
0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-09-19 09:45:35 | 58.90 | 13.29 |
1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 2017-05-03 11:05:13 | 239.90 | 19.93 |
2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 2018-01-18 14:48:30 | 199.00 | 17.87 |
3 | 00024acbcdf0a6daa1e931b038114c75 | 1 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 2018-08-15 10:10:18 | 12.99 | 12.79 |
4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 1 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 2017-02-13 13:57:51 | 199.90 | 18.14 |
... | ... | ... | ... | ... | ... | ... | ... |
112645 | fffc94f6ce00a00581880bf54a75a037 | 1 | 4aa6014eceb682077f9dc4bffebc05b0 | b8bc237ba3788b23da09c0f1f3a3288c | 2018-05-02 04:11:01 | 299.99 | 43.41 |
112646 | fffcd46ef2263f404302a634eb57f7eb | 1 | 32e07fd915822b0765e448c4dd74c828 | f3c38ab652836d21de61fb8314b69182 | 2018-07-20 04:31:48 | 350.00 | 36.53 |
112647 | fffce4705a9662cd70adb13d4a31832d | 1 | 72a30483855e2eafc67aee5dc2560482 | c3cfdc648177fdbbbb35635a37472c53 | 2017-10-30 17:14:25 | 99.90 | 16.95 |
112648 | fffe18544ffabc95dfada21779c9644f | 1 | 9c422a519119dcad7575db5af1ba540e | 2b3e4a2a3ea8e01938cabda2a3e5cc79 | 2017-08-21 00:04:32 | 55.99 | 8.72 |
112649 | fffe41c64501cc87c801fd61db3f6244 | 1 | 350688d9dc1e75ff97be326363655e01 | f7ccf836d21b2fb1de37564105216cc1 | 2018-06-12 17:10:13 | 43.00 | 12.79 |
112650 rows × 7 columns
3.7 | olist_products_dataset
# Load the products table from CSV and render it as the cell output.
# The misspelled column names (e.g. product_name_lenght) come from the file itself.
products = pd.read_csv("datasets/olist_products_dataset.csv")
products
product_id | product_category_name | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | |
---|---|---|---|---|---|---|---|---|---|
0 | 1e9e8ef04dbcff4541ed26657ea517e5 | perfumaria | 40.0 | 287.0 | 1.0 | 225.0 | 16.0 | 10.0 | 14.0 |
1 | 3aa071139cb16b67ca9e5dea641aaa2f | artes | 44.0 | 276.0 | 1.0 | 1000.0 | 30.0 | 18.0 | 20.0 |
2 | 96bd76ec8810374ed1b65e291975717f | esporte_lazer | 46.0 | 250.0 | 1.0 | 154.0 | 18.0 | 9.0 | 15.0 |
3 | cef67bcfe19066a932b7673e239eb23d | bebes | 27.0 | 261.0 | 1.0 | 371.0 | 26.0 | 4.0 | 26.0 |
4 | 9dc1a7de274444849c219cff195d0b71 | utilidades_domesticas | 37.0 | 402.0 | 4.0 | 625.0 | 20.0 | 17.0 | 13.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32946 | a0b7d5a992ccda646f2d34e418fff5a0 | moveis_decoracao | 45.0 | 67.0 | 2.0 | 12300.0 | 40.0 | 40.0 | 40.0 |
32947 | bf4538d88321d0fd4412a93c974510e6 | construcao_ferramentas_iluminacao | 41.0 | 971.0 | 1.0 | 1700.0 | 16.0 | 19.0 | 16.0 |
32948 | 9a7c6041fa9592d9d9ef6cfe62a71f8c | cama_mesa_banho | 50.0 | 799.0 | 1.0 | 1400.0 | 27.0 | 7.0 | 27.0 |
32949 | 83808703fc0706a22e264b9d75f04a2e | informatica_acessorios | 60.0 | 156.0 | 2.0 | 700.0 | 31.0 | 13.0 | 20.0 |
32950 | 106392145fca363410d287a815be6de4 | cama_mesa_banho | 58.0 | 309.0 | 1.0 | 2083.0 | 12.0 | 2.0 | 7.0 |
32951 rows × 9 columns
3.8 | olist_sellers_dataset
# TEAL_NUMBER_2 -- NOTE(review): presumably the second teal-coloured table of the
# entity relationship diagram (cf. the teal2* merge variables); confirm.
# Load the sellers table from CSV and render it as the cell output.
sellers = pd.read_csv("datasets/olist_sellers_dataset.csv")
sellers
seller_id | seller_zip_code_prefix | seller_city | seller_state | |
---|---|---|---|---|
0 | 3442f8959a84dea7ee197c632cb2df15 | 13023 | campinas | SP |
1 | d1b65fc7debc3361ea86b5f14c68d2e2 | 13844 | mogi guacu | SP |
2 | ce3ad9de960102d0677a81f5d0bb7b2d | 20031 | rio de janeiro | RJ |
3 | c0f3eea2e14555b6faeea3dd58c1b1c3 | 4195 | sao paulo | SP |
4 | 51a04a8a6bdcb23deccc82b0b80742cf | 12914 | braganca paulista | SP |
... | ... | ... | ... | ... |
3090 | 98dddbc4601dd4443ca174359b237166 | 87111 | sarandi | PR |
3091 | f8201cab383e484733266d1906e2fdfa | 88137 | palhoca | SC |
3092 | 74871d19219c7d518d0090283e03c137 | 4650 | sao paulo | SP |
3093 | e603cf3fec55f8697c9059638d6c8eb5 | 96080 | pelotas | RS |
3094 | 9e25199f6ef7e7c347120ff175652c3b | 12051 | taubate | SP |
3095 rows × 4 columns
3.9 | olist_geolocation_dataset
# Load the geolocation table from CSV and render it as the cell output.
# A zip code prefix can occur on many rows of this table.
geolocation = pd.read_csv("datasets/olist_geolocation_dataset.csv")
geolocation
geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_city | geolocation_state | |
---|---|---|---|---|---|
0 | 1037 | -23.545621 | -46.639292 | sao paulo | SP |
1 | 1046 | -23.546081 | -46.644820 | sao paulo | SP |
2 | 1046 | -23.546129 | -46.642951 | sao paulo | SP |
3 | 1041 | -23.544392 | -46.639499 | sao paulo | SP |
4 | 1035 | -23.541578 | -46.641607 | sao paulo | SP |
... | ... | ... | ... | ... | ... |
1000158 | 99950 | -28.068639 | -52.010705 | tapejara | RS |
1000159 | 99900 | -27.877125 | -52.224882 | getulio vargas | RS |
1000160 | 99950 | -28.071855 | -52.014716 | tapejara | RS |
1000161 | 99980 | -28.388932 | -51.846871 | david canabarro | RS |
1000162 | 99950 | -28.070104 | -52.018658 | tapejara | RS |
1000163 rows × 5 columns
4 | Merging Datasets
4.1 | Merging Individual Tables
- First, the olist_order_reviews_dataset and olist_orders_dataset are merged using an outer merge on the shared key of
order_id
for both tables - Next olist_order_payments_dataset is merged on the previously merged tables using an outer merge on the
order_id
shared key
# Outer-join orders with their reviews on order_id so that rows present in only
# one of the two tables are kept.
purpleRedMerge = orders.merge(orderReviews, how='outer', on='order_id')
# Rows without a review score are given 3.5. The result is assigned back rather
# than using fillna(inplace=True) on the column selection: the chained in-place
# form is deprecated and does not modify the frame under pandas Copy-on-Write.
purpleRedMerge['review_score'] = purpleRedMerge['review_score'].fillna(3.5)
purpleRedMerge
order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | review_id | review_score | review_comment_title | review_comment_message | review_creation_date | review_answer_timestamp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 00:00:00 | a54f0611adc9ed256b57ede6b6eb5114 | 4.0 | NaN | Não testei o produto ainda, mas ele veio corre... | 2017-10-11 00:00:00 | 2017-10-12 03:43:48 |
1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 00:00:00 | 8d5266042046a06655c8db133d120ba5 | 4.0 | Muito boa a loja | Muito bom o produto. | 2018-08-08 00:00:00 | 2018-08-08 18:37:50 |
2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 00:00:00 | e73b67b67587f7644d5bd1a52deb1b01 | 5.0 | NaN | NaN | 2018-08-18 00:00:00 | 2018-08-22 19:07:58 |
3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 00:00:00 | 359d03e676b3c069f62cadba8dd3f6e8 | 5.0 | NaN | O produto foi exatamente o que eu esperava e e... | 2017-12-03 00:00:00 | 2017-12-05 19:21:58 |
4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 00:00:00 | e50934924e227544ba8246aeb3770dd4 | 5.0 | NaN | NaN | 2018-02-17 00:00:00 | 2018-02-18 13:02:51 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
99987 | 9c5dedf39a927c1b2549525ed64a053c | 39bd1228ee8140590ac3aca26f2dfe00 | delivered | 2017-03-09 09:54:05 | 2017-03-09 09:54:05 | 2017-03-10 11:18:03 | 2017-03-17 15:08:01 | 2017-03-28 00:00:00 | e262b3f92d1ce917aa412a9406cf61a6 | 5.0 | NaN | NaN | 2017-03-22 00:00:00 | 2017-03-23 11:02:08 |
99988 | 63943bddc261676b46f01ca7ac2f7bd8 | 1fca14ff2861355f6e5f14306ff977a7 | delivered | 2018-02-06 12:58:58 | 2018-02-06 13:10:37 | 2018-02-07 23:22:42 | 2018-02-28 17:37:56 | 2018-03-02 00:00:00 | 29bb71b2760d0f876dfa178a76bc4734 | 4.0 | NaN | So uma peça que veio rachado mas tudo bem rs | 2018-03-01 00:00:00 | 2018-03-02 17:50:01 |
99989 | 83c1379a015df1e13d02aae0204711ab | 1aa71eb042121263aafbe80c1b562c9c | delivered | 2017-08-27 14:46:43 | 2017-08-27 15:04:16 | 2017-08-28 20:52:26 | 2017-09-21 11:24:17 | 2017-09-27 00:00:00 | 371579771219f6db2d830d50805977bb | 5.0 | NaN | Foi entregue antes do prazo. | 2017-09-22 00:00:00 | 2017-09-22 23:10:57 |
99990 | 11c177c8e97725db2631073c19f07b62 | b331b74b18dc79bcdf6532d51e1637c1 | delivered | 2018-01-08 21:28:27 | 2018-01-08 21:36:21 | 2018-01-12 15:35:03 | 2018-01-25 23:32:54 | 2018-02-15 00:00:00 | 8ab6855b9fe9b812cd03a480a25058a1 | 2.0 | NaN | Foi entregue somente 1. Quero saber do outro p... | 2018-01-26 00:00:00 | 2018-01-27 09:16:56 |
99991 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-09 22:11:59 | 2018-03-16 13:08:30 | 2018-04-03 00:00:00 | dc9c59b4688062c25758c2be4cafc523 | 5.0 | NaN | NaN | 2018-03-17 00:00:00 | 2018-03-17 16:33:31 |
99992 rows × 14 columns
# Attach the payment rows to the orders+reviews table (outer join on order_id).
purpleRedGreyMerge = pd.merge(purpleRedMerge, orderPayments, how="outer", on="order_id")
4.2 | Further Merges & Drops
- olist_order_customer_dataset is merged on the previously combined tables through an outer merge on
customer_id
as the shared key with duplicates and null values then dropped - The temporary table
teal2BlueMerge
is created by merging together olist_sellers_dataset and olist_geolocation_dataset through an outer merge, left on seller_zip_code_prefix
and right on geolocation_zip_code_prefix
with duplicates and null values then dropped - As before, another temporary dataframe is created, merging olist_order_items_dataset and olist_products_dataset using an outer merge on the shared key
product_id
- The two previously created temporary tables are merged together on an outer merge on the shared key
seller_id
# Outer-join customers with the orders/reviews/payments table on customer_id,
# then keep only the first row per customer_id and discard rows without one.
teal1PurpleRedGreyMerge = (
    orderCustomers
    .merge(purpleRedGreyMerge, how="outer", on='customer_id')
    .drop_duplicates(subset=['customer_id'])
    .dropna(subset=['customer_id'])
)
teal1PurpleRedGreyMerge
# Outer-join sellers with geolocation rows on the zip code prefix, then keep
# only the first matched row per seller_id and discard rows without a seller.
# NOTE(review): the outer join materialises every geolocation row before the
# de-duplication; de-duplicating geolocation first may be cheaper -- confirm the
# result is equivalent before changing.
teal2BlueMerge = (
    sellers
    .merge(
        geolocation,
        how='outer',
        left_on='seller_zip_code_prefix',
        right_on='geolocation_zip_code_prefix',
    )
    .drop_duplicates(subset=['seller_id'])
    .dropna(subset=['seller_id'])
)
teal2BlueMerge
seller_id | seller_zip_code_prefix | seller_city | seller_state | geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_city | geolocation_state | |
---|---|---|---|---|---|---|---|---|---|
0 | 3442f8959a84dea7ee197c632cb2df15 | 13023.0 | campinas | SP | 13023.0 | -22.898536 | -47.063125 | campinas | SP |
80 | e0eabded302882513ced4ea3eb0c7059 | 13023.0 | campinas | SP | 13023.0 | -22.898536 | -47.063125 | campinas | SP |
160 | d1b65fc7debc3361ea86b5f14c68d2e2 | 13844.0 | mogi guacu | SP | 13844.0 | -22.382941 | -46.946641 | mogi-guacu | SP |
263 | ce3ad9de960102d0677a81f5d0bb7b2d | 20031.0 | rio de janeiro | RJ | 20031.0 | -22.910641 | -43.176510 | rio de janeiro | RJ |
650 | 1d2732ef8321502ee8488e8bed1ab8cd | 20031.0 | rio de janeiro | RJ | 20031.0 | -22.910641 | -43.176510 | rio de janeiro | RJ |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
434730 | f1fdf2d13186575751aa25876536d85c | 5314.0 | sao paulo | SP | 5314.0 | -23.534949 | -46.733916 | sao paulo | SP |
434792 | 98dddbc4601dd4443ca174359b237166 | 87111.0 | sarandi | PR | 87111.0 | -23.456431 | -51.866369 | sarandi | PR |
434860 | 74871d19219c7d518d0090283e03c137 | 4650.0 | sao paulo | SP | 4650.0 | -23.659845 | -46.677882 | sao paulo | SP |
434928 | e603cf3fec55f8697c9059638d6c8eb5 | 96080.0 | pelotas | RS | 96080.0 | -31.744231 | -52.328761 | pelotas | RS |
435024 | 9e25199f6ef7e7c347120ff175652c3b | 12051.0 | taubate | SP | 12051.0 | -23.016003 | -45.582021 | taubate | SP |
3095 rows × 9 columns
# Outer-join order items with product attributes on product_id.
orangeYellowMerge = pd.merge(orderItems, products, how='outer', on='product_id')
# Missing category names become the empty string. Assign the result back instead
# of calling fillna(inplace=True) on the column selection: the chained in-place
# form is deprecated and does not modify the frame under pandas Copy-on-Write.
orangeYellowMerge['product_category_name'] = orangeYellowMerge['product_category_name'].fillna(value="")
# Every remaining missing value in the frame is replaced by 0.
# NOTE(review): values zero-filled here are no longer NaN when the KNN imputer
# runs later -- confirm that is intended.
orangeYellowMerge.fillna(value=0, inplace=True)
orangeYellowMerge
order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | product_category_name | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-09-19 09:45:35 | 58.9 | 13.29 | cool_stuff | 58.0 | 598.0 | 4.0 | 650.0 | 28.0 | 9.0 | 14.0 |
1 | 130898c0987d1801452a8ed92a670612 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-07-05 02:44:11 | 55.9 | 17.96 | cool_stuff | 58.0 | 598.0 | 4.0 | 650.0 | 28.0 | 9.0 | 14.0 |
2 | 532ed5e14e24ae1f0d735b91524b98b9 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2018-05-23 10:56:25 | 64.9 | 18.33 | cool_stuff | 58.0 | 598.0 | 4.0 | 650.0 | 28.0 | 9.0 | 14.0 |
3 | 6f8c31653edb8c83e1a739408b5ff750 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-08-07 18:55:08 | 58.9 | 16.17 | cool_stuff | 58.0 | 598.0 | 4.0 | 650.0 | 28.0 | 9.0 | 14.0 |
4 | 7d19f4ef4d04461989632411b7e588b9 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-08-16 22:05:11 | 58.9 | 13.29 | cool_stuff | 58.0 | 598.0 | 4.0 | 650.0 | 28.0 | 9.0 | 14.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
112645 | ffebd80e3291e811c308365936897efd | 1 | 4cc4d02efc8f249c13355147fb44e34d | d1c7fa84e48cfa21a0e595167c1c500e | 2018-07-18 20:23:55 | 129.9 | 51.20 | ferramentas_jardim | 37.0 | 653.0 | 1.0 | 6700.0 | 35.0 | 12.0 | 22.0 |
112646 | ffee31fb4b5e35c9123608015637c495 | 1 | b10ecf8e33aaaea419a9fa860ea80fb5 | 0241d4d5d36f10f80c644447315af0bd | 2018-08-21 10:10:11 | 99.0 | 13.52 | moveis_decoracao | 30.0 | 308.0 | 1.0 | 2300.0 | 37.0 | 30.0 | 20.0 |
112647 | fff7c4452f050315db1b3f24d9df5fcd | 1 | dd469c03ad67e201bc2179ef077dcd48 | 7e93a43ef30c4f03f38b393420bc753a | 2017-06-07 17:05:23 | 736.0 | 20.91 | relogios_presentes | 33.0 | 658.0 | 3.0 | 400.0 | 19.0 | 9.0 | 15.0 |
112648 | fffa82886406ccf10c7b4e35c4ff2788 | 1 | bbe7651fef80287a816ead73f065fc4b | 8f2ce03f928b567e3d56181ae20ae952 | 2017-12-22 17:31:42 | 229.9 | 44.02 | esporte_lazer | 32.0 | 280.0 | 2.0 | 2700.0 | 60.0 | 15.0 | 15.0 |
112649 | fffe41c64501cc87c801fd61db3f6244 | 1 | 350688d9dc1e75ff97be326363655e01 | f7ccf836d21b2fb1de37564105216cc1 | 2018-06-12 17:10:13 | 43.0 | 12.79 | cama_mesa_banho | 47.0 | 511.0 | 1.0 | 600.0 | 30.0 | 3.0 | 19.0 |
112650 rows × 15 columns
# Outer-join the items/products table with the sellers/geolocation table on
# seller_id, then discard the seller/geolocation descriptors that are not used.
teal2BlueOrangeYellowMerge = (
    orangeYellowMerge
    .merge(teal2BlueMerge, how='outer', on='seller_id')
    .drop(columns=["seller_state", "geolocation_state", "geolocation_zip_code_prefix", "geolocation_city"])
)
4.3 | Final Merges & Drops
- Finally, the two created temporary tables are merged through an outer merge on the shared key
order_id
- For this analysis, many columns of this final table are not needed and thus dropped
- This final table is then exported as a
.csv
file
# Combine the item/product/seller side with the customer/order/review/payment
# side through an outer join on order_id, print the column summary, and render
# the frame.
finalMerge = teal2BlueOrangeYellowMerge.merge(teal1PurpleRedGreyMerge, how='outer', on="order_id")
finalMerge.info()
finalMerge
<class 'pandas.core.frame.DataFrame'>
Int64Index: 113425 entries, 0 to 113424
Data columns (total 40 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 113425 non-null object
1 order_item_id 112650 non-null float64
2 product_id 112650 non-null object
3 seller_id 112650 non-null object
4 shipping_limit_date 112650 non-null object
5 price 112650 non-null float64
6 freight_value 112650 non-null float64
7 product_category_name 112650 non-null object
8 product_name_lenght 112650 non-null float64
9 product_description_lenght 112650 non-null float64
10 product_photos_qty 112650 non-null float64
11 product_weight_g 112650 non-null float64
12 product_length_cm 112650 non-null float64
13 product_height_cm 112650 non-null float64
14 product_width_cm 112650 non-null float64
15 seller_zip_code_prefix 112650 non-null float64
16 seller_city 112650 non-null object
17 geolocation_lat 112397 non-null float64
18 geolocation_lng 112397 non-null float64
19 customer_id 113425 non-null object
20 customer_unique_id 113425 non-null object
21 customer_zip_code_prefix 113425 non-null int64
22 customer_city 113425 non-null object
23 customer_state 113425 non-null object
24 order_status 113425 non-null object
25 order_purchase_timestamp 113425 non-null object
26 order_approved_at 113264 non-null object
27 order_delivered_carrier_date 111457 non-null object
28 order_delivered_customer_date 110196 non-null object
29 order_estimated_delivery_date 113425 non-null object
30 review_id 112464 non-null object
31 review_score 113425 non-null float64
32 review_comment_title 13505 non-null object
33 review_comment_message 47928 non-null object
34 review_creation_date 112464 non-null object
35 review_answer_timestamp 112464 non-null object
36 payment_sequential 113422 non-null float64
37 payment_type 113422 non-null object
38 payment_installments 113422 non-null float64
39 payment_value 113422 non-null float64
dtypes: float64(17), int64(1), object(22)
memory usage: 35.5+ MB
order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | product_category_name | product_name_lenght | product_description_lenght | ... | review_id | review_score | review_comment_title | review_comment_message | review_creation_date | review_answer_timestamp | payment_sequential | payment_type | payment_installments | payment_value | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1.0 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-09-19 09:45:35 | 58.9 | 13.29 | cool_stuff | 58.0 | 598.0 | ... | 97ca439bc427b48bc1cd7177abe71365 | 5.0 | NaN | Perfeito, produto entregue antes do combinado. | 2017-09-21 00:00:00 | 2017-09-22 10:57:03 | 1.0 | credit_card | 2.0 | 72.19 |
1 | 130898c0987d1801452a8ed92a670612 | 1.0 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-07-05 02:44:11 | 55.9 | 17.96 | cool_stuff | 58.0 | 598.0 | ... | b11cba360bbe71410c291b764753d37f | 5.0 | NaN | lannister como sempre, entregou certinho e den... | 2017-07-14 00:00:00 | 2017-07-17 12:50:07 | 1.0 | boleto | 1.0 | 73.86 |
2 | 532ed5e14e24ae1f0d735b91524b98b9 | 1.0 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2018-05-23 10:56:25 | 64.9 | 18.33 | cool_stuff | 58.0 | 598.0 | ... | af01c4017c5ab46df6cc810e069e654a | 4.0 | super recomendo | carrinho muito bonito | 2018-06-05 00:00:00 | 2018-06-06 21:41:12 | 1.0 | credit_card | 2.0 | 83.23 |
3 | 6f8c31653edb8c83e1a739408b5ff750 | 1.0 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-08-07 18:55:08 | 58.9 | 16.17 | cool_stuff | 58.0 | 598.0 | ... | 8304ff37d8b16b57086fa283fe0c44f8 | 5.0 | NaN | NaN | 2017-08-10 00:00:00 | 2017-08-13 03:35:17 | 1.0 | credit_card | 3.0 | 75.07 |
4 | 7d19f4ef4d04461989632411b7e588b9 | 1.0 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-08-16 22:05:11 | 58.9 | 13.29 | cool_stuff | 58.0 | 598.0 | ... | 426f43a82185969503fb3c86241a9535 | 5.0 | NaN | NaN | 2017-08-25 00:00:00 | 2017-08-28 00:51:18 | 1.0 | credit_card | 4.0 | 72.19 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
113420 | 2f634e2cebf8c0283e7ef0989f77d217 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 44c6d194170df31929a4ab4b7ae8512a | 1.0 | NaN | Comprei um perfume Bleu de Chanel , paguei e n... | 2017-10-29 00:00:00 | 2017-10-29 16:40:21 | 1.0 | credit_card | 12.0 | 615.53 |
113421 | 2b0edc4c59d83dcef85466718c36a317 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 62729f1cf3b504674bb4043189c4416f | 2.0 | NaN | NaN | 2017-10-19 00:00:00 | 2017-10-19 02:37:06 | 1.0 | credit_card | 1.0 | 112.91 |
113422 | 81b7c7bbc8ec003eeb67d87441a6a148 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | abbfacb2964f74f6487c9c10ac46daa6 | 3.0 | NaN | NaN | 2018-08-15 00:00:00 | 2018-08-19 22:35:54 | 1.0 | voucher | 1.0 | 92.76 |
113423 | 8b08f0e729f58529ed03e763270f78d5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | d7cbf05434eb13d1fd5a0dd207c165ec | 1.0 | NaN | O produto estava anunciado no site, mas não ti... | 2017-09-29 00:00:00 | 2017-09-29 13:03:23 | 1.0 | credit_card | 1.0 | 136.50 |
113424 | 9b932dca249f9971b86dde2f2d7ad412 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 93514264c4caad00a9a2de8211661bd7 | 1.0 | NaN | NaN | 2017-12-09 00:00:00 | 2017-12-09 11:28:09 | 1.0 | credit_card | 1.0 | 66.42 |
113425 rows × 40 columns
# Drop identifier, timestamp, free-text and address columns that the model does
# not use. The original list named 'order_purchase_timestamp' twice; it is
# listed once here.
finalMergedata = finalMerge.drop(columns=[
    'order_id', 'order_item_id', 'seller_id', 'shipping_limit_date',
    'seller_zip_code_prefix', 'seller_city', 'customer_id', 'customer_zip_code_prefix', 'customer_city',
    'customer_state', 'order_status', 'order_purchase_timestamp', 'review_id',
    'review_comment_title',
    'review_comment_message', 'order_approved_at', 'review_creation_date',
    'payment_type', 'payment_installments', 'order_delivered_carrier_date',
    'order_delivered_customer_date', 'order_estimated_delivery_date',
    'review_answer_timestamp', 'payment_sequential',
])
# to_csv() without a path only returns the CSV text and writes nothing; pass a
# file name so the table is actually exported.
finalMergedata.to_csv('finalMergedata.csv', index=True)
5 | Imputing Missing Values & Train Test Split
5.1 | KNN Imputation
- Below the
sklearn.impute.KNNImputer
class is used to impute missing numerical values in the final dataframe using n=5
nearest neighbors - All remaining null or missing values are dropped
# Numeric columns whose missing values are filled by KNN imputation. The
# spelling 'lenght' matches the dataset's column names; the original selection
# used 'product_description_length', which is not a column and raises KeyError.
to_impute = finalMergedata[['product_name_lenght', 'product_description_lenght', 'product_photos_qty',
                            'geolocation_lat', 'geolocation_lng']]
imputer = KNNImputer(n_neighbors=5)
impute_fit = imputer.fit_transform(to_impute)
# Blank / whitespace-only category names are treated as missing and labelled
# "other". The result is assigned back rather than using inplace=True on the
# column selection, which is deprecated and has no effect under pandas
# Copy-on-Write.
finalMergedata['product_category_name'] = (
    finalMergedata['product_category_name']
    .replace(r'^\s*$', np.nan, regex=True)
    .fillna(value="other")
)
# Write the imputed values back into the same columns.
finalMergedata[['product_name_lenght', 'product_description_lenght', 'product_photos_qty',
                'geolocation_lat', 'geolocation_lng']] = impute_fit
# Any row that still has a missing value in any column is removed.
finalMergedata = finalMergedata.dropna()
finalMergedata
product_id | price | freight_value | product_category_name | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | geolocation_lat | geolocation_lng | customer_unique_id | review_score | payment_value | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4244733e06e7ecb4970a6e2683c13e61 | 58.9 | 13.29 | cool_stuff | 58.0 | 598.0 | 4.0 | 650.0 | 28.0 | 9.0 | 14.0 | -22.498183 | -44.123614 | 871766c5855e863f6eccc05f988b23cb | 5.0 | 72.19 |
1 | 4244733e06e7ecb4970a6e2683c13e61 | 55.9 | 17.96 | cool_stuff | 58.0 | 598.0 | 4.0 | 650.0 | 28.0 | 9.0 | 14.0 | -22.498183 | -44.123614 | 0fb8e3eab2d3e79d92bb3fffbb97f188 | 5.0 | 73.86 |
2 | 4244733e06e7ecb4970a6e2683c13e61 | 64.9 | 18.33 | cool_stuff | 58.0 | 598.0 | 4.0 | 650.0 | 28.0 | 9.0 | 14.0 | -22.498183 | -44.123614 | 3419052c8c6b45daf79c1e426f9e9bcb | 4.0 | 83.23 |
3 | 4244733e06e7ecb4970a6e2683c13e61 | 58.9 | 16.17 | cool_stuff | 58.0 | 598.0 | 4.0 | 650.0 | 28.0 | 9.0 | 14.0 | -22.498183 | -44.123614 | e7c828d22c0682c1565252deefbe334d | 5.0 | 75.07 |
4 | 4244733e06e7ecb4970a6e2683c13e61 | 58.9 | 13.29 | cool_stuff | 58.0 | 598.0 | 4.0 | 650.0 | 28.0 | 9.0 | 14.0 | -22.498183 | -44.123614 | 0bb98ba72dcc08e95f9d8cc434e9a2cc | 5.0 | 72.19 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
112645 | f739e9151702508b18f796c53005e5e9 | 119.9 | 16.14 | moveis_decoracao | 44.0 | 533.0 | 2.0 | 1000.0 | 69.0 | 11.0 | 11.0 | -25.450057 | -49.260674 | 3d50a44231c2a153219cef76ee3e445d | 4.0 | 136.04 |
112646 | f4135cbdece8245560f7be179533797a | 209.0 | 16.21 | eletronicos | 32.0 | 1306.0 | 3.0 | 500.0 | 17.0 | 9.0 | 25.0 | -25.379062 | -49.227004 | 52a953541861bbb4b79b39df24e00243 | 4.0 | 225.21 |
112647 | 69c590f7ffc7bf8db97190b6cb6ed62e | 6729.0 | 193.21 | pcs | 50.0 | 1935.0 | 4.0 | 5660.0 | 54.0 | 18.0 | 47.0 | -23.306545 | -51.153471 | 459bef486812aa25204be022145caa62 | 3.5 | 6922.21 |
112648 | 5ff4076c0f01eeba4f728c9e3fa2653c | 27.9 | 14.44 | bebidas | 28.0 | 242.0 | 1.0 | 2000.0 | 19.0 | 38.0 | 19.0 | -23.567864 | -46.592424 | ddb2249e0316d365ceae561c0f011bce | 5.0 | 84.68 |
112649 | 5ff4076c0f01eeba4f728c9e3fa2653c | 27.9 | 14.44 | bebidas | 28.0 | 242.0 | 1.0 | 2000.0 | 19.0 | 38.0 | 19.0 | -23.567864 | -46.592424 | ddb2249e0316d365ceae561c0f011bce | 5.0 | 84.68 |
112647 rows × 16 columns
# Show the rows whose category name is a key of og_name_to_idx. The result is
# only displayed; finalMergedata itself is not modified.
finalMergedata.loc[finalMergedata['product_category_name'].isin(og_name_to_idx.keys())]
5.2 | Label Encoding Non-Numeric Values
- The `LabelEncoder` class from the `sklearn.preprocessing` module is used to encode non-numeric values
- Predictors (in the form of customer IDs) and classes (in the form of a matrix of product information) are created to be used in the model
# Build one row per (product_id, product_category_name) by averaging the
# remaining columns over all orders of that product.
#
# `DataFrame.drop` removes *index labels* unless told otherwise, so the column
# has to be named through `columns=`; the bare positional form raises KeyError.
training_data = finalMergedata.drop(columns='customer_unique_id')
training_data = (
    training_data
    .groupby(['product_id', 'product_category_name'])
    # Average numeric columns only, so a leftover non-numeric column cannot
    # make the aggregation fail.
    .mean(numeric_only=True)
    .reset_index()
)
training_data
# Replace the raw product ids with integer codes. The fitted encoder is kept
# so the codes can be translated back to product ids later on.
product_id_le = preprocessing.LabelEncoder()
training_data['product_id'] = product_id_le.fit_transform(
    training_data['product_id'].to_numpy()
)

# Same treatment for the category names.
cat_le = preprocessing.LabelEncoder()
training_data['product_category_name'] = cat_le.fit_transform(
    training_data['product_category_name'].to_numpy()
)
training_data

# Number of distinct encoded product ids.
len(training_data['product_id'].unique())

# The first column becomes the class label, every other column a feature.
y = training_data.iloc[:, 0].to_numpy()
X = training_data.iloc[:, 1:].to_numpy()
X, y
6 | KNN Classifier Model & Recommender System
6.1 | k-Nearest Neighbor Model
- The basic outline for a KNN classifier is presented, with a `metric` parameter modifying the distance between products (and their associated product categories): the distance is increased for products whose categories have lower cosine similarity and decreased for products whose categories have higher cosine similarity
# Per-position scale factors multiplied into the element-wise difference of
# two feature rows inside `metric`.
weights = np.array([20000000, 40, 30, 1, 1, 20, 10, 10, 10, 10, 30, 30, 1, 10])


def metric(u, v):
    """Return the custom distance between two encoded feature rows.

    The rows are subtracted element-wise and scaled by ``weights``. Position 0
    (the encoded category) is then overwritten with ``20000000`` minus the
    entry of ``similarities_df`` looked up for the two decoded category names,
    and position 11 is overwritten with ``5 - 40 * (weighted difference)``.
    The result is the norm of that adjusted vector.

    Args:
        u: First feature row; ``u[0]`` must be a code known to ``cat_le``.
        v: Second feature row, same layout as ``u``.

    Returns:
        The norm of the adjusted, weighted difference vector.
    """
    # Decode position 0 of each row back to its category name.
    name_u = cat_le.inverse_transform([int(u[0])])[0]
    name_v = cat_le.inverse_transform([int(v[0])])[0]

    scaled = (u - v) * weights

    # The category term comes from the similarity table, not from the
    # difference of the two integer codes.
    similarity = similarities_df[og_name_to_idx[name_u]][og_name_to_idx[name_v]]
    scaled[0] = 20000000 - similarity

    # NOTE(review): the "5 -" offset looks like it was written for a 1-5
    # review score, but index 11 may be a coordinate column in the feature
    # layout -- confirm which position was intended.
    scaled[11] = 5 - scaled[11] * 40

    return norm(scaled)
# Fit a 4-nearest-neighbour classifier that uses the custom `metric`.
neigh = KNeighborsClassifier(n_neighbors=4, metric=metric).fit(X, y)

# Probe the model with one hand-written feature row.
probs = neigh.predict_proba(
    [[63, 101.65, 18.590, 53.0, 596.0, 6.0, 300.0, 20.0, 16.0, 16.0,
      -23.537922, -46.477696, 5.0, 120.240]]
)
probs

# Positions of the four largest class probabilities (in no particular order).
ind = np.argpartition(probs[0], -4)[-4:]
ind
# Show the training rows for four encoded product ids (presumably the values
# of `ind` from an earlier run -- confirm).
# The mask has to be built from `training_data` itself: `finalMergedata` has a
# different index and still holds the raw string ids, so a mask derived from it
# cannot be aligned with `training_data` and never matches the integer codes.
display(training_data[training_data['product_id'].isin([9281, 18733, 29589, 0])])
6.2 | Recommender System
- Finally, everything is put together to obtain a recommendation based on the specific parameters of an individual customer, which is implemented in the `get_recommendation` function
- As per the request of this datathon, only the top 3 recommendations are returned
- Another function, `get_recommendation_for_group`, is defined, allowing the input of a group of customer IDs for tailored recommendations to each customer
# Order rows used by the recommender: every row of `finalMergedata` that has
# no missing value.
customer_order_info = finalMergedata.dropna()
customer_order_info
def get_recommendation(customer_id):
    """Recommend products for one customer based on their well-reviewed orders.

    The customer's orders with ``review_score >= 3`` are ranked by review
    score (ties broken by most recent purchase). The top three are fed to the
    fitted ``neigh`` classifier and the class probabilities are summed into a
    score per product. Products the customer already bought get a score of 0.
    While fewer than five products have a positive score, further orders are
    added one at a time until all qualifying orders have been used.

    Args:
        customer_id: Value to match against ``customer_unique_id``.

    Returns:
        An array with the raw product ids of the five best-scoring products,
        or ``None`` when the customer has no order with ``review_score >= 3``.
        When fewer than five products score above zero, the remaining slots
        are filled with zero-score products.
    """
    n_seed_orders = 3   # orders used for the first scoring pass
    n_candidates = 5    # number of product ids returned

    orders = customer_order_info[customer_order_info['customer_unique_id'] == customer_id]
    # use products with review >= 3
    orders = orders[orders['review_score'] >= 3]
    if orders.empty:
        # Nothing to base a recommendation on; predicting on an empty matrix
        # would raise instead of returning None.
        return None

    # sort by review score, break ties with most recent purchase
    orders = orders.sort_values(['review_score', 'order_purchase_timestamp'], ascending=False)
    # drop extra info we don't need and encode
    orders = orders.drop(['customer_unique_id', 'order_purchase_timestamp'], axis=1)
    orders['product_id'] = product_id_le.transform(orders['product_id'].to_numpy())
    orders['product_category_name'] = cat_le.transform(orders['product_category_name'].to_numpy())
    # Align the columns with the frame the model was trained on. Reindexing to
    # `finalMergedata.columns` would bring the dropped columns back as NaN.
    orders = orders.reindex(training_data.columns, axis=1)

    purchased = orders['product_id'].to_numpy()
    features = orders.iloc[:, 1:].to_numpy()

    def top_candidates(scores):
        # Zero out already-purchased products, then pick the best-scoring slots.
        scores[purchased] = 0
        k = min(n_candidates, scores.size)
        return np.argpartition(scores, -k)[-k:]

    scores = neigh.predict_proba(features[:n_seed_orders]).sum(axis=0)
    ind = top_candidates(scores)

    # Widen the seed one order at a time. Only the newly added row is scored,
    # since the summed probabilities of the earlier rows are already in
    # `scores`. The loop runs until every row has been used, last one included.
    used = min(n_seed_orders, len(features))
    while (scores[ind] > 0).sum() < n_candidates and used < len(features):
        scores += neigh.predict_proba(features[used:used + 1])[0]
        used += 1
        ind = top_candidates(scores)

    return product_id_le.inverse_transform(ind)
# Recommend for one example customer, then show that customer's own orders
# followed by the order rows of the recommended products.
recommend = get_recommendation('290c77bc529b7ac935b93aa66c333dc3')
display(
    customer_order_info.loc[
        customer_order_info['customer_unique_id'].eq('290c77bc529b7ac935b93aa66c333dc3')
    ]
)
customer_order_info.loc[customer_order_info['product_id'].isin(recommend)]
def get_recommendation_for_group(customer_ids):
    """Collect the recommendations for several customers into one array.

    ``get_recommendation`` is called once per customer and its result is
    printed next to the customer id. The results are merged and de-duplicated,
    and every product that any customer of the group has already bought
    (according to ``customer_order_info``) is removed.

    Args:
        customer_ids: Iterable of ``customer_unique_id`` values.

    Returns:
        A sorted array of unique recommended product ids, excluding products
        bought by the group. Empty when nothing could be recommended.
        Customers for which ``get_recommendation`` returns ``None`` are
        skipped.
    """
    group_orders = customer_order_info[customer_order_info['customer_unique_id'].isin(customer_ids)]
    bought = group_orders['product_id'].to_numpy()

    per_customer = []
    for customer_id in customer_ids:
        # Each call runs the KNN model, so compute it once and reuse it.
        rec = get_recommendation(customer_id)
        print(customer_id, rec)
        if rec is not None:
            per_customer.append(np.ravel(rec))

    if not per_customer:
        return np.array([])

    recommendation = np.unique(np.concatenate(per_customer))
    # filter out bought items
    return recommendation[~np.isin(recommendation, bought)]
# Run the group recommender for three example customers, then show the
# group's own orders followed by the catalogue rows of the recommendations.
customer_ids = [
    '290c77bc529b7ac935b93aa66c333dc3',
    '0fb8e3eab2d3e79d92bb3fffbb97f188',
    '3419052c8c6b45daf79c1e426f9e9bcb',
]
recommendation = get_recommendation_for_group(customer_ids)
recommendation
display(customer_order_info.loc[customer_order_info['customer_unique_id'].isin(customer_ids)])
products.loc[products['product_id'].isin(recommendation)]