Additional methods#

This notebook provides an overview of the built-in clustering performance evaluation, ways of accessing the individual labels resulting from clustering, and saving the object to disk.

Clustering performance evaluation#

Clustergram includes handy wrappers around a selection of clustering performance metrics offered by scikit-learn. Data that were originally computed on a GPU are converted to NumPy arrays on the fly.

Let’s load the data and fit a clustergram on the Palmer penguins dataset. See the Introduction for an overview of the dataset.

import seaborn
from sklearn.preprocessing import scale
from clustergram import Clustergram

seaborn.set(style='whitegrid')

df = seaborn.load_dataset('penguins')
data = scale(df.drop(columns=['species', 'island', 'sex']).dropna())

cgram = Clustergram(range(1, 12), n_init=10, verbose=False)
cgram.fit(data)
Matplotlib is building the font cache; this may take a moment.
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[1], line 1
----> 1 import seaborn
      2 from sklearn.preprocessing import scale
      3 from clustergram import Clustergram

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/seaborn/__init__.py:2
      1 # Import seaborn objects
----> 2 from .rcmod import *  # noqa: F401,F403
      3 from .utils import *  # noqa: F401,F403
      4 from .palettes import *  # noqa: F401,F403

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/seaborn/rcmod.py:5
      3 import matplotlib as mpl
      4 from cycler import cycler
----> 5 from . import palettes
      8 __all__ = ["set_theme", "set", "reset_defaults", "reset_orig",
      9            "axes_style", "set_style", "plotting_context", "set_context",
     10            "set_palette"]
     13 _style_keys = [
     14 
     15     "axes.facecolor",
   (...)
     50 
     51 ]

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/seaborn/palettes.py:9
      5 import matplotlib as mpl
      7 from .external import husl
----> 9 from .utils import desaturate, get_color_cycle
     10 from .colors import xkcd_rgb, crayons
     11 from ._compat import get_colormap

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/seaborn/utils.py:14
     12 import matplotlib as mpl
     13 from matplotlib.colors import to_rgb
---> 14 import matplotlib.pyplot as plt
     15 from matplotlib.cbook import normalize_kwargs
     17 from seaborn._core.typing import deprecated

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/pyplot.py:56
     54 from cycler import cycler
     55 import matplotlib
---> 56 import matplotlib.colorbar
     57 import matplotlib.image
     58 from matplotlib import _api

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/colorbar.py:19
     16 import numpy as np
     18 import matplotlib as mpl
---> 19 from matplotlib import _api, cbook, collections, cm, colors, contour, ticker
     20 import matplotlib.artist as martist
     21 import matplotlib.patches as mpatches

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/contour.py:15
     13 import matplotlib as mpl
     14 from matplotlib import _api, _docstring
---> 15 from matplotlib.backend_bases import MouseButton
     16 from matplotlib.lines import Line2D
     17 from matplotlib.path import Path

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/backend_bases.py:46
     43 import numpy as np
     45 import matplotlib as mpl
---> 46 from matplotlib import (
     47     _api, backend_tools as tools, cbook, colors, _docstring, text,
     48     _tight_bbox, transforms, widgets, is_interactive, rcParams)
     49 from matplotlib._pylab_helpers import Gcf
     50 from matplotlib.backend_managers import ToolManager

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/text.py:16
     14 from . import _api, artist, cbook, _docstring
     15 from .artist import Artist
---> 16 from .font_manager import FontProperties
     17 from .patches import FancyArrowPatch, FancyBboxPatch, Rectangle
     18 from .textpath import TextPath, TextToPath  # noqa # Logically located here

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/font_manager.py:1582
   1578     _log.info("generated new fontManager")
   1579     return fm
-> 1582 fontManager = _load_fontmanager()
   1583 findfont = fontManager.findfont
   1584 get_font_names = fontManager.get_font_names

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/font_manager.py:1576, in _load_fontmanager(try_read_cache)
   1574             _log.debug("Using fontManager instance from %s", fm_path)
   1575             return fm
-> 1576 fm = FontManager()
   1577 json_dump(fm, fm_path)
   1578 _log.info("generated new fontManager")

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/font_manager.py:1041, in FontManager.__init__(self, size, weight)
   1038 try:
   1039     for fontext in ["afm", "ttf"]:
   1040         for path in [*findSystemFonts(paths, fontext=fontext),
-> 1041                      *findSystemFonts(fontext=fontext)]:
   1042             try:
   1043                 self.addfont(path)

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/font_manager.py:280, in findSystemFonts(fontpaths, fontext)
    278     fontpaths = []
    279 else:
--> 280     installed_fonts = _get_fontconfig_fonts()
    281     if sys.platform == 'darwin':
    282         fontpaths = [*X11FontDirectories, *OSXFontDirectories]

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/font_manager.py:258, in _get_fontconfig_fonts()
    255         _log.warning(  # fontconfig 2.7 implemented --format.
    256             'Matplotlib needs fontconfig>=2.7 to query system fonts.')
    257         return []
--> 258     out = subprocess.check_output(['fc-list', '--format=%{file}\\n'])
    259 except (OSError, subprocess.CalledProcessError):
    260     return []

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/subprocess.py:466, in check_output(timeout, *popenargs, **kwargs)
    463         empty = b''
    464     kwargs['input'] = empty
--> 466 return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
    467            **kwargs).stdout

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/subprocess.py:550, in run(input, capture_output, timeout, check, *popenargs, **kwargs)
    548 with Popen(*popenargs, **kwargs) as process:
    549     try:
--> 550         stdout, stderr = process.communicate(input, timeout=timeout)
    551     except TimeoutExpired as exc:
    552         process.kill()

File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/subprocess.py:1196, in Popen.communicate(self, input, timeout)
   1194     self._stdin_write(input)
   1195 elif self.stdout:
-> 1196     stdout = self.stdout.read()
   1197     self.stdout.close()
   1198 elif self.stderr:

KeyboardInterrupt: 

Silhouette score#

Compute the mean Silhouette Coefficient of all samples. See scikit-learn documentation for details.

cgram.silhouette_score()
2     0.531540
3     0.447219
4     0.399584
5     0.377720
6     0.368591
7     0.330961
8     0.296052
9     0.285682
10    0.284024
11    0.276113
Name: silhouette_score, dtype: float64

Once computed, the resulting Series is available as cgram.silhouette_. Calling the original method again will recompute the score.

cgram.silhouette_.plot();
../_images/2a5b6517ad8b875b412b367402d2fd2575b1b78e8dd52b3d2df5d965928f60b8.png

Calinski and Harabasz score#

Compute the Calinski and Harabasz score, also known as the Variance Ratio Criterion. See scikit-learn documentation for details.

cgram.calinski_harabasz_score()
2     482.191469
3     441.677075
4     400.410025
5     411.175066
6     382.302322
7     352.464103
8     334.044560
9     316.429048
10    301.158645
11    287.334763
Name: calinski_harabasz_score, dtype: float64

Once computed, the resulting Series is available as cgram.calinski_harabasz_. Calling the original method again will recompute the score.

cgram.calinski_harabasz_.plot();
../_images/8ca8b2cffb2a3e538618f58b71bd2d1612a1763f2a69dc2c35c7816752438cb5.png

Davies-Bouldin score#

Compute the Davies-Bouldin score. See scikit-learn documentation for details.

cgram.davies_bouldin_score()
2     0.714064
3     0.943553
4     0.944215
5     0.973248
6     0.994783
7     1.075790
8     1.151462
9     1.200647
10    1.212565
11    1.230724
Name: davies_bouldin_score, dtype: float64

Once computed, the resulting Series is available as cgram.davies_bouldin_. Calling the original method again will recompute the score.

cgram.davies_bouldin_.plot();
../_images/14d2a060c7bfa00326d08849334f006c0545f92ea61a4394c3fdddb5fce55a06.png

Accessing labels#

Clustergram stores the resulting labels for each of the tested options, which can be accessed as:

cgram.labels_
1 2 3 4 5 6 7 8 9 10 11
0 0 0 0 0 0 2 6 6 3 9 3
1 0 0 0 0 0 2 6 0 3 4 9
2 0 0 0 0 0 2 6 0 7 4 9
3 0 0 0 0 0 2 6 6 6 9 3
4 0 0 0 0 3 5 2 6 6 7 6
... ... ... ... ... ... ... ... ... ... ... ...
337 0 1 1 3 4 0 1 3 2 6 1
338 0 1 1 3 4 0 1 3 2 6 1
339 0 1 1 1 2 4 3 1 1 3 4
340 0 1 1 3 4 0 5 5 5 1 7
341 0 1 1 1 2 4 5 5 5 1 7

342 rows × 11 columns

Saving clustergram#

If you want to save your computed clustergram.Clustergram object to disk, you can use the pickle library:

import pickle

with open('clustergram.pickle','wb') as f:
    pickle.dump(cgram, f)
with open('clustergram.pickle','rb') as f:
    loaded = pickle.load(f)