
Commit f783cfd

Fixes #902 (#1632)
* fix probability out of bound
* fixed probability out of bound
* cleared the notebook output
* fix of probabilities out of bound
1 parent: 181f8d6, commit: f783cfd

2 files changed (18 additions, 0 deletions)


introduction_to_amazon_algorithms/lda_topic_modeling/generate_example_data.py

Lines changed: 10 additions & 0 deletions
@@ -67,6 +67,9 @@ def generate_griffiths_data(num_documents=5000, average_document_length=150,
     vocabulary_size = 25
     image_dim = np.int(np.sqrt(vocabulary_size))
 
+    # to be used for numerical stability
+    epsilon = np.finfo(float).eps
+
     # perform checks on input
     assert num_topics in [5,10], 'Example data only available for 5 or 10 topics'
     if alpha:
@@ -89,18 +92,23 @@ def generate_griffiths_data(num_documents=5000, average_document_length=150,
     # create the col topics. when num_topics = 10 add the row topics as well
     #
     beta = np.zeros((num_topics,image_dim,image_dim), dtype=np.float)
+
+
     for i in range(image_dim):
         beta[i,:,i] = dirichlet_eta.rvs(size=1)
     if num_topics == 10:
         for i in range(image_dim):
             beta[i+image_dim,i,:] = dirichlet_eta.rvs(size=1)
     beta.resize(num_topics, vocabulary_size)
+    # normalize beta to ensure each row is a valid probability dist
+    beta /= (1 + epsilon)
 
     # generate documents using the LDA model / process
     #
     document_lengths = sp.stats.poisson(average_document_length).rvs(size=num_documents)
     documents = np.zeros((num_documents,vocabulary_size), dtype=np.float)
     thetas = dirichlet_alpha.rvs(size=num_documents) # precompute topic distributions for performance
+    thetas /= (1 + epsilon)
     for m in range(num_documents):
         document_length = document_lengths[m]
         theta = thetas[m]
@@ -193,3 +201,5 @@ def plot_lda_topics(documents, nrows, ncols, with_colorbar=True,
                          vmin=vmin, vmax=vmax)
 
     return fig
+
+
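
Both files receive the same guard: scipy.stats.dirichlet.rvs can return a vector whose floating-point sum lands a hair above 1.0, and downstream multinomial sampling then rejects it as an invalid probability vector, which appears to be the "probability out of bound" failure the commit message refers to. Dividing each draw by (1 + epsilon), where epsilon is the machine epsilon, pulls the sum back to at most 1.0 without visibly changing the distribution. Below is a minimal standalone sketch of the pattern; the prior, sample count, and document length are illustrative assumptions, not values taken from the repository.

# Minimal sketch of the (1 + epsilon) guard added in this commit; the prior,
# number of draws, and trial count are assumptions for illustration only.
import numpy as np
import scipy.stats as sp_stats

epsilon = np.finfo(float).eps                      # machine epsilon for float64

dirichlet_alpha = sp_stats.dirichlet(np.ones(5))   # assumed symmetric topic prior
thetas = dirichlet_alpha.rvs(size=10)              # one topic distribution per row
thetas /= (1 + epsilon)                            # nudge row sums to at most 1.0

# Without the division, a row sum can exceed 1.0 by round-off and
# np.random.multinomial raises a ValueError for that probability vector.
for theta in thetas:
    counts = np.random.multinomial(150, theta)     # 150 words per document (assumed)
    assert counts.sum() == 150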

scientific_details_of_algorithms/lda_topic_modeling/generate_example_data.py

Lines changed: 8 additions & 0 deletions
@@ -66,6 +66,9 @@ def generate_griffiths_data(num_documents=5000, average_document_length=150,
     """
     vocabulary_size = 25
     image_dim = np.int(np.sqrt(vocabulary_size))
+
+    # to be used for numerical stability
+    epsilon = np.finfo(float).eps
 
     # perform checks on input
     assert num_topics in [5,10], 'Example data only available for 5 or 10 topics'
@@ -98,9 +101,14 @@ def generate_griffiths_data(num_documents=5000, average_document_length=150,
 
     # generate documents using the LDA model / process
     #
+    # normalize beta to ensure each row is a valid probability dist
+    beta /= (1 + epsilon)
+
     document_lengths = sp.stats.poisson(average_document_length).rvs(size=num_documents)
     documents = np.zeros((num_documents,vocabulary_size), dtype=np.float)
     thetas = dirichlet_alpha.rvs(size=num_documents) # precompute topic distributions for performance
+    thetas /= (1 + epsilon)
+
     for m in range(num_documents):
         document_length = document_lengths[m]
         theta = thetas[m]
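
The second copy of the generator gets the identical nudge for beta and thetas. As a quick illustrative check (not part of the commit), the snippet below confirms that dividing Dirichlet draws by (1 + epsilon) removes row sums above 1.0 while leaving them numerically equal to 1; the prior and sample size are assumed for the example.

# Illustrative check of the nudge; the 0.1 prior and 1000 samples are
# assumptions for this example, not values taken from the repository.
import numpy as np
import scipy.stats as sp_stats

epsilon = np.finfo(float).eps
dirichlet_eta = sp_stats.dirichlet(np.full(25, 0.1))   # assumed word-level prior
beta = dirichlet_eta.rvs(size=1000)                    # 1000 topic-word rows

print("max row sum before nudge:", beta.sum(axis=1).max())
beta /= (1 + epsilon)
print("max row sum after nudge: ", beta.sum(axis=1).max())
print("rows still sum to ~1:", np.allclose(beta.sum(axis=1), 1.0))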
