File size: 3,837 Bytes
f1b477a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111

from dotenv import load_dotenv

load_dotenv(dotenv_path="../env_vars.env")

import json
from pathlib import Path
import os

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.vectorstores import DeepLake

from utils import clean_emotions_json


"""
This function takes all the songs we have and use the lyrics to create a list of 8 emotions.
These 8 emotions will then be used for similarity matching with user prompt emotions, instead of using the entire lyrics.
"""
def generate_emotion_from_lyrics(input_file:str, prompt_path:str, output_file:str, clean_output=True) -> None:
    prompt = PromptTemplate(
        input_variables=["song_lyrics"],
        template=Path(prompt_path).read_text(),
    )
    llm = ChatOpenAI(temperature=0.8)
    chain = LLMChain(llm=llm, prompt=prompt)

    # Read file that has scraped lyrics.
    with open(input_file, "r") as f:
        lyrical_data = json.load(f)
    '''
        'song' looks like as follows
        {
            "song_name": "Cruel Summer",
            "iframe": "<iframe style=\"border-radius: 12px\" width=\"100%\" height=\"152\" title=\"Spotify Embed:...
            "lyrics": "Fever dream high in the quiet of the nightYou know that I caught it Bad... 
        }
    '''
    # Collect 8 common emotions conveyed in the songs using their lyrics
    emotion_data = []
    for song in lyrical_data:
        print(f"{song['song_name']}")
        emotions = chain.run(song_lyrics=song["lyrics"])
        emotion_data.append(
            {
                "song_name": song["song_name"],
                "iframe": song["iframe"],
                "emotions": emotions
            })
        print(emotions)

    # Write to output file which will be used to store the song emotions as embeddings
    with open(output_file, "w") as f:
        json.dump(emotion_data, f, indent=4)

    print(f"Spotify song, url and song emotions saved to {output_file}")

    # Clean the generated emotions
    if clean_output:
        clean_emotions_json("../data/spotify_song_url_emotions.json")


def create_db(dataset_path: str, input_file: str) -> DeepLake:
    with open(input_file, "r") as f:
        emotion_data = json.load(f)

    texts = []
    metadatas = []

    '''
    {
        "song_name": "Mastermind",
        "iframe": "<iframe style=\"border-radius: 12px\" width=\"100%\" height=\"152\" title=\"Spotify Embed: 
        "emotions": "excitement, happiness, love, desire, confidence, determination, vulnerability, manipulation"
    }
    '''
    for song in emotion_data:
        texts.append(song["emotions"])
        metadatas.append(
            {
                "name": song["song_name"],
                "iframe": song["iframe"],
            }
        )

    embeddings = OpenAIEmbeddings(model=os.environ['MODEL'])

    db = DeepLake.from_texts(
        texts, embeddings, metadatas=metadatas, dataset_path=dataset_path
    )

    return db

def create_emotion_embeddings(input_file:str) ->None:
    dataset_path = f"hub://{os.environ['ACTIVELOOP_ORG_ID']}/{os.environ['DATASET']}"
    create_db(dataset_path,  input_file)

if __name__ == "__main__":

    # Get top 8 emotions for each song lyric that was scraped, using GPT 3.5 Turbo
    prompt_path = "../prompts/get_song_emotions.prompt"
    input_file = "../data/spotify_song_url_lyrics.json"
    output_file = "../data/spotify_song_url_emotions.json"
    generate_emotion_from_lyrics(input_file, prompt_path, output_file, clean_output=True)

    # Convert the generated emotions to embeddings (using 'text-embedding-ada-002' model) and save them in a vector store(DeepLake)
    input_file = "../data/spotify_song_url_emotions.json"
    create_emotion_embeddings(input_file)