|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
from sklearn.manifold import TSNE |
|
from scipy.spatial.distance import cdist |
|
|
|
# Page header: inject CSS that colours <h1> white, then show the title.
st.markdown('<style>h1{color: white;}</style>', unsafe_allow_html=True)
st.title('Call on Doc Skin Care Product Recommender')

# Introductory copy, written one element per line in display order.
for intro_line in (
    'Find the Right Skin Care for you',
    "Hi there! If you have a skincare product you currently like I can help you find a similar one based on the ingredients.",
    'Please select a product below so I can recommend similar ones',
):
    st.write(intro_line)

# Product catalogue; the code below reads its 'Label', 'Brand' and 'Name'
# columns.
df = pd.read_csv("./data/cosmetics.csv")

# Cascading selection: each select box only offers values that are present
# in the rows matching the previous choice.
category_options = df['Label'].unique()
category = st.selectbox(label='Select a product category', options=category_options)
in_category = df['Label'] == category
category_subset = df[in_category]

brand_options = sorted(category_subset['Brand'].unique())
brand = st.selectbox(label='Select a brand', options=brand_options)
from_brand = category_subset['Brand'] == brand
category_brand_subset = category_subset[from_brand]

product_options = sorted(category_brand_subset['Name'].unique())
product = st.selectbox(label='Select the product', options=product_options)
|
|
|
|
|
|
|
|
|
|
|
|
|
def oh_encoder(tokens):
    """One-hot encode a product's ingredient tokens.

    Reads two module-level names: ``N`` (length of the returned vector) and
    ``ingredient_idx`` (mapping from ingredient token to its position).

    Args:
        tokens: iterable of ingredient tokens, each a key of
            ``ingredient_idx``.

    Returns:
        A NumPy array of length ``N`` holding 1 at the position of every
        token in ``tokens`` and 0 elsewhere.

    Raises:
        KeyError: if a token is not a key of ``ingredient_idx``.
    """
    encoded = np.zeros(N)
    # Look every token up first, then switch all of its positions on at once.
    # The explicit integer dtype keeps an empty token list a valid index.
    positions = np.array([ingredient_idx[token] for token in tokens], dtype=int)
    encoded[positions] = 1
    return encoded
|
|
|
def closest_point(point, points):
    """Return the entry of ``points`` that lies nearest to ``point``.

    Distances are computed with ``cdist`` using its default metric.

    Args:
        point: a single coordinate sequence.
        points: an indexable collection of coordinate sequences with the
            same dimensionality as ``point``.

    Returns:
        The element of ``points`` at the smallest distance from ``point``.
    """
    # cdist expects two 2-D inputs, so the single query point is wrapped.
    distances = cdist([point], points)
    nearest = distances.argmin()
    return points[nearest]
|
|
|
|
|
# Restrict the catalogue to the category picked in the first select box.
if category is not None:
    category_subset = df[df['Label'] == category]

# Build the product-by-ingredient matrix only once a product has been chosen.
if product is not None:
    # Renumber rows 0..M-1 so that row i of the matrix A describes row i of
    # category_subset.
    category_subset = category_subset.reset_index(drop=True)

    ingredient_idx = {}  # lower-cased ingredient token -> column number in A
    corpus = []          # one token list per product, in row order

    for ingredients in category_subset['Ingredients']:
        if isinstance(ingredients, str):
            # Splitting on ', ' (comma + space) keeps names that contain a
            # bare comma, e.g. "1,2-hexanediol", in one piece.
            tokens = ingredients.lower().split(', ')
        else:
            # A missing cell is not a string (pandas reads it as NaN) and
            # has no .lower(); treat the product as having no known
            # ingredients so it still gets a row in A.
            tokens = []
        corpus.append(tokens)
        for ingredient in tokens:
            # A token seen for the first time takes the next free column.
            ingredient_idx.setdefault(ingredient, len(ingredient_idx))

    M = len(category_subset)  # number of products (rows of A)
    N = len(ingredient_idx)   # number of distinct ingredients (columns of A)

    # A[i, j] is 1 when product i lists ingredient j, otherwise 0.
    A = np.zeros((M, N))
    for i, tokens in enumerate(corpus):
        A[i, :] = oh_encoder(tokens)
|
|
|
def _ingredient_set(text):
    """Return the comma-separated, space-stripped ingredients of *text* as a set.

    A non-string value (for example a missing cell) yields an empty set.
    """
    if not isinstance(text, str):
        return set()
    return {piece.strip(' ') for piece in text.split(",")}


model_run = st.button('Find similar products!')

# The feature matrix A and the re-indexed category_subset are only built when
# a product is selected, so a click without a selection is ignored.
if model_run and product is not None:
    st.write('Based on the ingredients of the product you selected')
    st.write('here are the top 10 products that are the most similar :sparkles:')

    # Project the one-hot ingredient matrix onto two dimensions.
    # NOTE(review): recent scikit-learn releases appear to require the
    # perplexity (left at its default here) to be smaller than the number of
    # rows in A; very small categories may need an explicit value - confirm.
    model = TSNE(n_components=2, learning_rate=150, random_state=42)
    tsne_features = model.fit_transform(A)

    category_subset['X'] = tsne_features[:, 0]
    category_subset['Y'] = tsne_features[:, 1]

    # The product name was picked from the selected brand's rows, so match on
    # brand as well; a same-named product from another brand must not be
    # mistaken for the target.
    is_target = (category_subset['Name'] == product) & (category_subset['Brand'] == brand)
    target = category_subset[is_target]
    target_x = target['X'].values[0]
    target_y = target['Y'].values[0]

    # Euclidean distance from every product to the target in the 2-D
    # embedding, stored as plain floats so the column sorts numerically.
    category_subset['distance'] = np.hypot(
        category_subset['X'] - target_x,
        category_subset['Y'] - target_y,
    )

    # Remove the target by its own row label instead of assuming it sorts
    # first, then rank the remaining products from nearest to farthest.
    top_matches = category_subset.drop(index=target.index[0]).sort_values(by=['distance'])

    # Ingredients shared with the target (case-sensitive, original spelling).
    c1_set = _ingredient_set(target['Ingredients'].values[0])
    top_matches['Ingredients in common'] = [
        c1_set.intersection(_ingredient_set(other_ingredients))
        for other_ingredients in top_matches['Ingredients']
    ]

    top_matches = top_matches[['Label', 'Brand', 'Name', 'Price', 'Ingredients', 'Ingredients in common']]

    # Number the rows from 1 so the displayed index reads as a similarity rank.
    top_matches = top_matches.reset_index(drop=True)
    top_matches.index = top_matches.index + 1

    st.dataframe(top_matches.head(10))
|
|
|
|
|
|