Skip to content

add similarity_search.py in machine_learning #3864

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Nov 13, 2020
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 113 additions & 0 deletions machine_learning/similarity_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
Similarity search is a search algorithm for finding the nearest vector from
a set of vectors, used in natural language processing.
This implementation measures distance with the Euclidean distance and
returns a list containing two items for each query vector:
1. the nearest vector
2. the distance between the query vector and the nearest vector
"""
import math
from typing import Union

import numpy as np

InputVal = Union[int, float, np.ndarray]


def euclidean(input_a: InputVal, input_b: InputVal) -> Union[float, None]:
    """
    Calculate the Euclidean distance between two scalars or two arrays.

    :param input_a: a numeric scalar or a numpy array.
    :param input_b: a numeric scalar or a numpy array of the same shape as
        ``input_a``.
    :return: the distance as a Python float, or ``None`` when exactly one of
        the inputs is a numpy array (a scalar cannot be compared to a vector).
    :raises ValueError: if both inputs are arrays but their shapes differ.

    >>> euclidean(0, 1)
    1.0
    >>> euclidean(0, 1.5)
    1.5
    >>> euclidean(np.array([0, 1]), np.array([1, 1]))
    1.0
    >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1]))
    1.0
    >>> euclidean(np.array([0], dtype=np.uint8), np.array([3], dtype=np.uint8))
    3.0
    >>> euclidean(np.array([0, 1]), 1) is None
    True
    """
    a_is_array = isinstance(input_a, np.ndarray)
    b_is_array = isinstance(input_b, np.ndarray)

    # Mixing a vector with a scalar is not a meaningful comparison.
    if a_is_array != b_is_array:
        return None

    if not a_is_array:
        # Scalars: sqrt((a - b) ** 2) is simply |a - b|. Converting to float
        # first lets int/float (and numpy scalar) inputs be mixed freely.
        return abs(float(input_a) - float(input_b))

    # Work in floating point so unsigned integer dtypes cannot wrap around on
    # subtraction and squares of large integers cannot overflow.
    vec_a = np.asarray(input_a, dtype=float)
    vec_b = np.asarray(input_b, dtype=float)
    if vec_a.shape != vec_b.shape:
        raise ValueError(
            f"Inputs must have the same shape, got {vec_a.shape} and {vec_b.shape}"
        )

    diff = vec_a - vec_b
    return math.sqrt(float(np.sum(diff * diff)))


def similarity_search(dataset: np.ndarray, value: np.ndarray) -> list:
    """
    Find, for every query vector, the nearest vector in ``dataset``.

    Distances are Euclidean. When several dataset vectors are equally close,
    the one with the lowest index is returned.

    :param dataset: 1-D array of scalars or 2-D array whose rows are vectors.
    :param value: the query scalars/vectors; must have the same number of
        dimensions, the same vector length and the same dtype as ``dataset``.
    :return: one ``[nearest_vector, distance]`` pair per query, where
        ``nearest_vector`` is a plain Python value/list and ``distance`` is a
        Python float.
    :raises TypeError: if the inputs differ in number of dimensions, vector
        length or dtype, or are not 1-D/2-D arrays.
    :raises IndexError: if ``dataset`` is empty while ``value`` is not.

    >>> a = np.array([0, 1, 2])
    >>> b = np.array([0])
    >>> similarity_search(a, b)
    [[0, 0.0]]

    >>> a = np.array([[0, 0], [1, 1], [2, 2]])
    >>> b = np.array([[0, 1]])
    >>> similarity_search(a, b)
    [[[0, 0], 1.0]]

    >>> a = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
    >>> b = np.array([[0, 0, 1]])
    >>> similarity_search(a, b)
    [[[0, 0, 0], 1.0]]

    >>> a = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
    >>> b = np.array([[0, 0, 0], [0, 0, 1]])
    >>> similarity_search(a, b)
    [[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]]

    >>> similarity_search(np.array([[1]]), np.array([1]))
    Traceback (most recent call last):
        ...
    TypeError: Wrong input data's dimensions... dataset : 2, value : 1
    """
    if dataset.ndim != value.ndim:
        raise TypeError(
            "Wrong input data's dimensions... "
            f"dataset : {dataset.ndim}, value : {value.ndim}"
        )

    # Only lists of scalars (1-D) or lists of vectors (2-D) are supported.
    if dataset.ndim not in (1, 2):
        raise TypeError("Wrong type")

    if dataset.ndim == 2 and dataset.shape[1] != value.shape[1]:
        raise TypeError(
            "Wrong input data's shape... "
            f"dataset : {dataset.shape[1]}, value : {value.shape[1]}"
        )

    if dataset.dtype != value.dtype:
        raise TypeError(
            "Input data have different datatype... "
            f"dataset : {dataset.dtype}, value : {value.dtype}"
        )

    if len(value) == 0:
        return []
    if len(dataset) == 0:
        raise IndexError("dataset must contain at least one vector")

    # Compute in floating point so unsigned dtypes cannot wrap around on
    # subtraction, and treat 1-D input as a column of length-1 vectors so a
    # single code path handles both ranks.
    points = dataset.astype(float)
    queries = value.astype(float)
    if points.ndim == 1:
        points = points[:, np.newaxis]
        queries = queries[:, np.newaxis]

    answer = []
    for query in queries:
        offsets = points - query
        distances = np.sqrt(np.sum(offsets * offsets, axis=1))
        # argmin returns the first minimum, i.e. the earliest dataset entry.
        nearest = int(np.argmin(distances))
        answer.append([dataset[nearest].tolist(), float(distances[nearest])])

    return answer


if __name__ == "__main__":
    # Executed as a script: run every doctest embedded in this module.
    from doctest import testmod

    testmod()