Additional methods#
This notebook provides an overview of the built-in clustering performance evaluation, ways of accessing the individual labels resulting from clustering, and saving the object to disk.
Clustering performance evaluation#
Clustergram includes handy wrappers around a selection of clustering performance metrics offered by scikit-learn. Data that were originally computed on a GPU are converted to NumPy on the fly.
Let’s load the data and fit a clustergram on the Palmer penguins dataset. See the Introduction for an overview of the dataset.
import seaborn
from sklearn.preprocessing import scale
from clustergram import Clustergram
seaborn.set(style='whitegrid')
df = seaborn.load_dataset('penguins')
data = scale(df.drop(columns=['species', 'island', 'sex']).dropna())
cgram = Clustergram(range(1, 12), n_init=10, verbose=False)
cgram.fit(data)
Matplotlib is building the font cache; this may take a moment.
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
Cell In[1], line 1
----> 1 import seaborn
2 from sklearn.preprocessing import scale
3 from clustergram import Clustergram
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/seaborn/__init__.py:2
1 # Import seaborn objects
----> 2 from .rcmod import * # noqa: F401,F403
3 from .utils import * # noqa: F401,F403
4 from .palettes import * # noqa: F401,F403
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/seaborn/rcmod.py:5
3 import matplotlib as mpl
4 from cycler import cycler
----> 5 from . import palettes
8 __all__ = ["set_theme", "set", "reset_defaults", "reset_orig",
9 "axes_style", "set_style", "plotting_context", "set_context",
10 "set_palette"]
13 _style_keys = [
14
15 "axes.facecolor",
(...)
50
51 ]
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/seaborn/palettes.py:9
5 import matplotlib as mpl
7 from .external import husl
----> 9 from .utils import desaturate, get_color_cycle
10 from .colors import xkcd_rgb, crayons
11 from ._compat import get_colormap
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/seaborn/utils.py:14
12 import matplotlib as mpl
13 from matplotlib.colors import to_rgb
---> 14 import matplotlib.pyplot as plt
15 from matplotlib.cbook import normalize_kwargs
17 from seaborn._core.typing import deprecated
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/pyplot.py:56
54 from cycler import cycler
55 import matplotlib
---> 56 import matplotlib.colorbar
57 import matplotlib.image
58 from matplotlib import _api
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/colorbar.py:19
16 import numpy as np
18 import matplotlib as mpl
---> 19 from matplotlib import _api, cbook, collections, cm, colors, contour, ticker
20 import matplotlib.artist as martist
21 import matplotlib.patches as mpatches
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/contour.py:15
13 import matplotlib as mpl
14 from matplotlib import _api, _docstring
---> 15 from matplotlib.backend_bases import MouseButton
16 from matplotlib.lines import Line2D
17 from matplotlib.path import Path
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/backend_bases.py:46
43 import numpy as np
45 import matplotlib as mpl
---> 46 from matplotlib import (
47 _api, backend_tools as tools, cbook, colors, _docstring, text,
48 _tight_bbox, transforms, widgets, is_interactive, rcParams)
49 from matplotlib._pylab_helpers import Gcf
50 from matplotlib.backend_managers import ToolManager
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/text.py:16
14 from . import _api, artist, cbook, _docstring
15 from .artist import Artist
---> 16 from .font_manager import FontProperties
17 from .patches import FancyArrowPatch, FancyBboxPatch, Rectangle
18 from .textpath import TextPath, TextToPath # noqa # Logically located here
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/font_manager.py:1582
1578 _log.info("generated new fontManager")
1579 return fm
-> 1582 fontManager = _load_fontmanager()
1583 findfont = fontManager.findfont
1584 get_font_names = fontManager.get_font_names
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/font_manager.py:1576, in _load_fontmanager(try_read_cache)
1574 _log.debug("Using fontManager instance from %s", fm_path)
1575 return fm
-> 1576 fm = FontManager()
1577 json_dump(fm, fm_path)
1578 _log.info("generated new fontManager")
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/font_manager.py:1041, in FontManager.__init__(self, size, weight)
1038 try:
1039 for fontext in ["afm", "ttf"]:
1040 for path in [*findSystemFonts(paths, fontext=fontext),
-> 1041 *findSystemFonts(fontext=fontext)]:
1042 try:
1043 self.addfont(path)
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/font_manager.py:280, in findSystemFonts(fontpaths, fontext)
278 fontpaths = []
279 else:
--> 280 installed_fonts = _get_fontconfig_fonts()
281 if sys.platform == 'darwin':
282 fontpaths = [*X11FontDirectories, *OSXFontDirectories]
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/site-packages/matplotlib/font_manager.py:258, in _get_fontconfig_fonts()
255 _log.warning( # fontconfig 2.7 implemented --format.
256 'Matplotlib needs fontconfig>=2.7 to query system fonts.')
257 return []
--> 258 out = subprocess.check_output(['fc-list', '--format=%{file}\\n'])
259 except (OSError, subprocess.CalledProcessError):
260 return []
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/subprocess.py:466, in check_output(timeout, *popenargs, **kwargs)
463 empty = b''
464 kwargs['input'] = empty
--> 466 return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
467 **kwargs).stdout
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/subprocess.py:550, in run(input, capture_output, timeout, check, *popenargs, **kwargs)
548 with Popen(*popenargs, **kwargs) as process:
549 try:
--> 550 stdout, stderr = process.communicate(input, timeout=timeout)
551 except TimeoutExpired as exc:
552 process.kill()
File ~/checkouts/readthedocs.org/user_builds/clustergram/conda/latest/lib/python3.12/subprocess.py:1196, in Popen.communicate(self, input, timeout)
1194 self._stdin_write(input)
1195 elif self.stdout:
-> 1196 stdout = self.stdout.read()
1197 self.stdout.close()
1198 elif self.stderr:
KeyboardInterrupt:
Silhouette score#
Compute the mean Silhouette Coefficient of all samples. See the scikit-learn documentation for details.
cgram.silhouette_score()
2 0.531540
3 0.447219
4 0.399584
5 0.377720
6 0.368591
7 0.330961
8 0.296052
9 0.285682
10 0.284024
11 0.276113
Name: silhouette_score, dtype: float64
Once computed, the resulting Series is available as cgram.silhouette_. Calling the original method again will recompute the score.
cgram.silhouette_.plot();
Calinski and Harabasz score#
Compute the Calinski and Harabasz score, also known as the Variance Ratio Criterion. See the scikit-learn documentation for details.
cgram.calinski_harabasz_score()
2 482.191469
3 441.677075
4 400.410025
5 411.175066
6 382.302322
7 352.464103
8 334.044560
9 316.429048
10 301.158645
11 287.334763
Name: calinski_harabasz_score, dtype: float64
Once computed, the resulting Series is available as cgram.calinski_harabasz_. Calling the original method again will recompute the score.
cgram.calinski_harabasz_.plot();
Davies-Bouldin score#
Compute the Davies-Bouldin score. See the scikit-learn documentation for details.
cgram.davies_bouldin_score()
2 0.714064
3 0.943553
4 0.944215
5 0.973248
6 0.994783
7 1.075790
8 1.151462
9 1.200647
10 1.212565
11 1.230724
Name: davies_bouldin_score, dtype: float64
Once computed, the resulting Series is available as cgram.davies_bouldin_. Calling the original method again will recompute the score.
cgram.davies_bouldin_.plot();
Accessing labels#
Clustergram stores the resulting labels for each of the tested options, which can be accessed as:
cgram.labels_
 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 2 | 6 | 6 | 3 | 9 | 3 |
1 | 0 | 0 | 0 | 0 | 0 | 2 | 6 | 0 | 3 | 4 | 9 |
2 | 0 | 0 | 0 | 0 | 0 | 2 | 6 | 0 | 7 | 4 | 9 |
3 | 0 | 0 | 0 | 0 | 0 | 2 | 6 | 6 | 6 | 9 | 3 |
4 | 0 | 0 | 0 | 0 | 3 | 5 | 2 | 6 | 6 | 7 | 6 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
337 | 0 | 1 | 1 | 3 | 4 | 0 | 1 | 3 | 2 | 6 | 1 |
338 | 0 | 1 | 1 | 3 | 4 | 0 | 1 | 3 | 2 | 6 | 1 |
339 | 0 | 1 | 1 | 1 | 2 | 4 | 3 | 1 | 1 | 3 | 4 |
340 | 0 | 1 | 1 | 3 | 4 | 0 | 5 | 5 | 5 | 1 | 7 |
341 | 0 | 1 | 1 | 1 | 2 | 4 | 5 | 5 | 5 | 1 | 7 |
342 rows × 11 columns
Saving clustergram#
If you want to save your computed clustergram.Clustergram object to disk, you can use the pickle library:
import pickle
with open('clustergram.pickle','wb') as f:
pickle.dump(cgram, f)
with open('clustergram.pickle','rb') as f:
loaded = pickle.load(f)