1
1
import numpy as np
2
2
import pandas as pd
3
3
4
- from api .openai_api_helper import get_embedding
4
+ from core .openai_api_helper import get_embedding
5
5
6
6
def cosine_similarity (a , b ):
7
7
if len (a ) > len (b ):
@@ -10,22 +10,20 @@ def cosine_similarity(a, b):
10
10
a = np .pad (a , (0 , len (b ) - len (a )), 'constant' )
11
11
return np .dot (a , b ) / (np .linalg .norm (a ) * np .linalg .norm (b ))
12
12
13
- def get_similarity_dataframe (query : str , dataset : pd .core .frame .DataFrame , rows : int ) -> pd .core .frame .DataFrame :
14
- SIMILARITIES_RESULTS_THRESHOLD = 0.70
15
-
13
+ def get_similarity_df (query , dataset , rows , similarities_results_threshold , embedding_model ) -> pd .core .frame .DataFrame :
16
14
# create a copy of the dataset
17
15
dataset_vectors = dataset .copy ()
18
16
19
17
# get the embeddings for the query
20
- query_embeddings = get_embedding (query )
18
+ query_embeddings = get_embedding (query , embedding_model )
21
19
22
20
# create a new column with the calculated similarity for each row
23
21
dataset_vectors ["similarity" ] = dataset_vectors ["Embedding" ].apply (
24
22
lambda x : cosine_similarity (np .array (query_embeddings ), np .array (x ))
25
23
)
26
24
27
25
# filter the videos by similarity
28
- mask = dataset_vectors ["similarity" ] >= SIMILARITIES_RESULTS_THRESHOLD
26
+ mask = dataset_vectors ["similarity" ] >= similarities_results_threshold
29
27
dataset_vectors = dataset_vectors [mask ].copy ()
30
28
31
29
# sort the videos by similarity
0 commit comments