Add document-topic plotting function

Chris Swierczewski · Chris Swierczewski · commit 564fbd11290b · 2017-11-25T00:34:30.000Z
diff --git a/lda_topic_modeling/generate_example_data.py b/lda_topic_modeling/generate_example_data.py
@@ -5,6 +5,8 @@
 import scipy as sp
 import scipy.stats
 
+from matplotlib.gridspec import GridSpec, GridSpecFromSubplotSpec
+
 def generate_griffiths_data(num_documents=5000, average_document_length=150,
                             num_topics=5, alpha=None, eta=None, seed=0):
     """Returns example documents from Griffiths-Steyvers [1].
@@ -46,7 +48,7 @@ def generate_griffiths_data(num_documents=5000, average_document_length=150,
     theta : Numpy NDArray
         A matrix of size `num_documents` x `num_topics` equal to the topic
         mixtures used to generate the output `documents`.
-    
+
     References
     ----------
     [1] Thomas L Griffiths and Mark Steyvers. "Finding Scientific Topics."
@@ -56,7 +58,7 @@ def generate_griffiths_data(num_documents=5000, average_document_length=150,
     """
     vocabulary_size = 25
     image_dim = np.int(np.sqrt(vocabulary_size))
-    
+
     # perform checks on input
     assert num_topics in [5,10], 'Example data only available for 5 or 10 topics'
     if alpha:
@@ -75,7 +77,7 @@ def generate_griffiths_data(num_documents=5000, average_document_length=150,
     dirichlet_eta = sp.stats.dirichlet(eta)
 
     # initialize a known topic-word distribution (beta) using eta. these are
-    # the "row" and "column" topics, respectively. when num_topics = 5 only 
+    # the "row" and "column" topics, respectively. when num_topics = 5 only
     # create the col topics. when num_topics = 10 add the row topics as well
     #
     beta = np.zeros((num_topics,image_dim,image_dim), dtype=np.float)
@@ -111,22 +113,22 @@ def plot_lda(data, nrows, ncols, with_colorbar=True, cmap=cm.viridis):
     fig, ax = plt.subplots(nrows, ncols, figsize=(ncols,nrows))
     vmin = 0
     vmax = data.max()
-    
+
     V = len(data[0])
     n = int(np.sqrt(V))
     for i in range(nrows):
         for j in range(ncols):
             index = i*ncols + j
-            
+
             if nrows > 1:
                 im = ax[i,j].matshow(data[index].reshape(n,n), cmap=cmap, vmin=vmin, vmax=vmax)
             else:
                 im = ax[j].matshow(data[index].reshape(n,n), cmap=cmap, vmin=vmin, vmax=vmax)
-                
+
     for axi in ax.ravel():
         axi.set_xticks([])
         axi.set_yticks([])
-        
+
     if with_colorbar:
         fig.colorbar(im, ax=ax.ravel().tolist(), orientation='horizontal', fraction=0.2)
     return fig
@@ -136,18 +138,50 @@ def match_estimated_topics(topics_known, topics_estimated):
     K, V = topics_known.shape
     permutation = -1*np.ones(K, dtype=np.int)
     unmatched_estimated_topics = []
-    
+
     for estimated_topic_index, t in enumerate(topics_estimated):
         matched_known_topic_index = np.argmin([np.linalg.norm(known_topic - t) for known_topic in topics_known])
         if permutation[matched_known_topic_index] == -1:
             permutation[matched_known_topic_index] = estimated_topic_index
         else:
             unmatched_estimated_topics.append(estimated_topic_index)
-            
+
     for estimated_topic_index in unmatched_estimated_topics:
         for i in range(K):
             if permutation[i] == -1:
                 permutation[i] = estimated_topic_index
                 break
-                
-    return permutation, (topics_estimated[permutation,:]).copy()
+
+    return permutation, (topics_estimated[permutation,:]).copy()
+
+def _document_with_topic(fig, gsi, index, document, topic_mixture=None,
+                         vmin=0, vmax=32):
+    ax_doc = fig.add_subplot(gsi[:5,:])
+    ax_doc.matshow(document.reshape(5,5), cmap='gray_r',
+                   vmin=vmin, vmax=vmax)
+    ax_doc.set_xticks([])
+    ax_doc.set_yticks([])
+
+    if topic_mixture is not None:
+        ax_topic = plt.subplot(gsi[-1,:])
+        ax_topic.matshow(topic_mixture.reshape(1,-1), cmap='Reds',
+                         vmin=0, vmax=1)
+        ax_topic.set_xticks([])
+        ax_topic.set_yticks([])
+
+def plot_lda_topics(documents, nrows, ncols, with_colorbar=True,
+                    topic_mixtures=None, cmap='Viridis', dpi=160):
+    fig = plt.figure()
+    gs = GridSpec(nrows, ncols)
+
+    vmin, vmax = (0, documents.max())
+
+    for i in range(nrows):
+        for j in range(ncols):
+            index = i*ncols + j
+            gsi = GridSpecFromSubplotSpec(6, 5, subplot_spec=gs[i,j])
+            _document_with_topic(fig, gsi, index, documents[index],
+                                 topic_mixture=topic_mixtures[index],
+                                 vmin=vmin, vmax=vmax)
+
+    return fig