File size: 1,880 Bytes
883557f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import numpy as np
from scipy.sparse import csr_matrix

"""

Function to find similar projects for single project matching.



Single Project Matching empowers you to choose an individual project using 

either the project IATI ID or title, and then unveils the top x projects within a filter (filtered_df) that 

bear the closest resemblance to your selected one (p_index).

"""

def find_similar(p_index, similarity_matrix, filtered_df, top_x):
    """
    Return the projects in filtered_df that are most similar to one project.

    p_index: row position of the selected project in similarity_matrix

    similarity_matrix: matrix with similarities of all projects; anything
        that is not already a csr_matrix is converted to one

    filtered_df: df with filter applied; its index labels are used directly
        as column positions of similarity_matrix

    top_x: maximum number of projects to return; zero or a negative value
        yields an empty result

    Returns a copy of the matching rows of filtered_df, ordered by descending
    similarity, with an added 'similarity' column. Rows whose similarity is
    not greater than 0 are dropped, so fewer than top_x rows may come back.
    filtered_df itself is not modified.
    """

    # make sure row extraction below works regardless of the input format
    if not isinstance(similarity_matrix, csr_matrix):
        similarity_matrix = csr_matrix(similarity_matrix)

    # index labels of the filtered projects double as matrix column positions
    filtered_indices = filtered_df.index.tolist()
    column_positions = np.asarray(filtered_indices, dtype=np.intp)

    # densify only the selected project's row, then keep the filtered columns
    # (cheaper than column-slicing the whole matrix first, same values)
    full_row = similarity_matrix.getrow(p_index).toarray().ravel()
    project_row = full_row[column_positions]

    # reverse first, then cut: unlike [-top_x:], this is empty for top_x == 0
    top_x = max(top_x, 0)
    sorted_positions = np.argsort(project_row)[::-1][:top_x]
    top_indices = [filtered_indices[position] for position in sorted_positions]
    top_values = project_row[sorted_positions]

    # explicit copy so adding a column never touches (or warns about) filtered_df
    result_df = filtered_df.loc[top_indices].copy()
    result_df['similarity'] = top_values

    # drop projects that have no positive similarity to the selected one
    result_df = result_df[result_df['similarity'] > 0]

    return result_df