@@ -67,6 +67,9 @@ def generate_griffiths_data(num_documents=5000, average_document_length=150,
67
67
vocabulary_size = 25
68
68
image_dim = np .int (np .sqrt (vocabulary_size ))
69
69
70
+ # to be used for numerical stability
71
+ epsilon = np .finfo (float ).eps
72
+
70
73
# perform checks on input
71
74
assert num_topics in [5 ,10 ], 'Example data only available for 5 or 10 topics'
72
75
if alpha :
@@ -89,18 +92,23 @@ def generate_griffiths_data(num_documents=5000, average_document_length=150,
89
92
# create the col topics. when num_topics = 10 add the row topics as well
90
93
#
91
94
beta = np .zeros ((num_topics ,image_dim ,image_dim ), dtype = np .float )
95
+
96
+
92
97
for i in range (image_dim ):
93
98
beta [i ,:,i ] = dirichlet_eta .rvs (size = 1 )
94
99
if num_topics == 10 :
95
100
for i in range (image_dim ):
96
101
beta [i + image_dim ,i ,:] = dirichlet_eta .rvs (size = 1 )
97
102
beta .resize (num_topics , vocabulary_size )
103
+ # normalize beta to ensure each row is a valid probability distribution
104
+ beta /= (1 + epsilon )
98
105
99
106
# generate documents using the LDA model / process
100
107
#
101
108
document_lengths = sp .stats .poisson (average_document_length ).rvs (size = num_documents )
102
109
documents = np .zeros ((num_documents ,vocabulary_size ), dtype = np .float )
103
110
thetas = dirichlet_alpha .rvs (size = num_documents ) # precompute topic distributions for performance
111
+ thetas /= (1 + epsilon )
104
112
for m in range (num_documents ):
105
113
document_length = document_lengths [m ]
106
114
theta = thetas [m ]
@@ -193,3 +201,5 @@ def plot_lda_topics(documents, nrows, ncols, with_colorbar=True,
193
201
vmin = vmin , vmax = vmax )
194
202
195
203
return fig
204
+
205
+
0 commit comments