Clustering with Other Algorithms

“Far better an approximate answer to the right question, which is often vague, than an exact answer to the wrong question, which can always be made precise.”

-John Tukey, The future of data analysis, p 13

Introduction

This vignette serves as a code repository for clustering algorithms that take distances or similarities as input. To be clear, it is not a referendum on which clustering algorithm is best, because there is no such thing as a best distance, a best clustering algorithm, or a best validation method. Every clustering problem is a domain-specific problem that needs patience, iteration, and domain expertise to produce usable results.

With that out of the way, please feel free to recommend clustering algorithms we may have missed by lodging an issue at https://github.com/bmuchmore/PreciseDist/issues

Data set-up

The data and set-up come from the Cell Cycle Vignette - Experiment 5: Minkowski 100x. See that vignette for more details.

# Attach the packages this chunk relies on, then load the bundled data set.
library(PreciseDist)
library(dplyr)
data("data_cell_cycle")
# Quick look at the structure of the first five columns.
str(data_cell_cycle[1:5])

# Numeric input: every column except `Cell_cycle`, as a matrix.
cell_cycle_data <- as.matrix(dplyr::select(data_cell_cycle, -Cell_cycle))
# The `Cell_cycle` column on its own, kept as a one-column matrix.
cell_cycle_labels <- as.matrix(dplyr::select(data_cell_cycle, Cell_cycle))

# Ten evenly spaced parameter values between 0.45 and 0.54, each handed to
# precise_func_fact() together with the "minkowski" function name.
cell_cycle_minkowski_params <- seq(from = 0.45, to = 0.54, length.out = 10)
cell_cycle_minkowski_funcs <- precise_func_fact(
  func = "minkowski",
  params = cell_cycle_minkowski_params
)
library(future)
library(doFuture)
# Register the doFuture adaptor as the parallel backend.
registerDoFuture()
# `multiprocess` is deprecated in the future package and defunct in recent
# releases; `multisession` (background R sessions) is the portable
# replacement. Adjust `workers` to the number of cores you have available.
plan(multisession, workers = 10)
# Compute one distance per entry of `cell_cycle_minkowski_funcs`.
# NOTE: `file` is a placeholder and must be replaced with a real, writable
# absolute path (including the file name and extension) before running.
cell_cycle_minkowski_dists <- cell_cycle_data %>%
  as.matrix() %>%
  precise_dist(
    dist_funcs = cell_cycle_minkowski_funcs,
    time_series = FALSE,
    partitions = 10,
    suffix = "cell_minkowski_",
    file = "/absolute_path/to_somewhere/with_full_name/inclusing_the/file_extension.rds",
    parallel = TRUE,
    local_timeout = Inf,
    verbose = TRUE
  )
# Step 1: apply the "laplacian" transform to the computed distances.
cell_cycle_minkowski_transformed <- precise_transform(
  cell_cycle_minkowski_dists,
  transform = "laplacian"
)

# Step 2: fuse the transformed distances using the "fuse" option.
cell_cycle_minkowski_fused <- cell_cycle_minkowski_transformed %>%
  precise_fusion(fusion = "fuse", verbose = TRUE)

# Step 3: build the graph object from the fused result. Its `fused_dist`
# element is what the clustering sections further down consume.
cell_cycle_minkowski_graph <- precise_graph(
  data = cell_cycle_minkowski_fused,
  method = 1,
  n_neighbors = 50,
  min_dist = 0.001,
  spread = 10,
  bandwidth = 10,
  verbose = TRUE,
  parallel = TRUE
)

Now that we have the graph, we will extract the distance and call precise_transform() to ensure that it is, in fact, in distance format. Please note, though, that some functions require similarities. For those functions, we coax the distance into a similarity using proxy::pr_dist2simil():

# Pull the fused distance out of the graph object and pass it through
# precise_transform() with `enforce_dist = TRUE`; the result is the input
# for every clustering algorithm below.
cell_cycle_for_clustering <- precise_transform(
  cell_cycle_minkowski_graph$fused_dist,
  enforce_dist = TRUE
)

Load data manipulation libraries

library(magrittr)
library(purrr)
library(tibble)
library(dplyr)
library(proxy)

Hierarchical Clustering

# Complete-linkage agglomerative clustering of the fused distance, cut into
# three groups. The result is a one-column tibble (`Hclust_Clusters`) with
# one label per observation, e.g. "Cluster_1".
hclust_clusters <- cell_cycle_for_clustering %>%
  as.dist() %>%
  stats::hclust(
    method = "complete",
    members = NULL
  ) %>%
  stats::cutree(k = 3, h = NULL) %>%
  # paste0() is vectorised and inserts no separator, so labels read
  # "Cluster_1" instead of the "Cluster_ 1" that paste() produced.
  paste0("Cluster_", .) %>%
  # Build the tibble directly; as_tibble() on a bare vector is discouraged.
  tibble::tibble(Hclust_Clusters = .)

K-Means Clustering

# k-means with three centers, run on the rows of `cell_cycle_for_clustering`.
# The result is a one-column tibble (`Kmeans_Clusters`) with one label per
# row, e.g. "Cluster_1".
# NOTE(review): stats::kmeans() expects a numeric data matrix, so each row of
# the distance matrix is treated as a feature vector rather than as pairwise
# distances -- confirm this is the intended use.
# NOTE: with `nstart = 1` the result depends on the random initial centers;
# call set.seed() beforehand if reproducible labels are needed.
kmeans_clusters <- cell_cycle_for_clustering %>%
  stats::kmeans(
    centers = 3,
    iter.max = 10,
    nstart = 1,
    algorithm = "Hartigan-Wong",
    trace = FALSE
  ) %>%
  .[["cluster"]] %>%
  # paste0() is vectorised and inserts no separator, so labels read
  # "Cluster_1" instead of the "Cluster_ 1" that paste() produced.
  paste0("Cluster_", .) %>%
  # Build the tibble directly; as_tibble() on a bare vector is discouraged.
  tibble::tibble(Kmeans_Clusters = .)

DBSCAN (Density-based spatial clustering of applications with noise)

library(dbscan)
# DBSCAN on the fused distance (passed as a `dist` object) with eps = 5 and
# minPts = 5. The result is a one-column tibble (`Dbscan_Clusters`) with one
# label per observation.
# Per the dbscan documentation, cluster id 0 marks noise points, so the label
# "Cluster_0" denotes noise rather than a real cluster.
dbscan_clusters <- cell_cycle_for_clustering %>%
  as.dist() %>%
  dbscan::dbscan(
    eps = 5,
    minPts = 5,
    weights = NULL,
    borderPoints = TRUE
  ) %>%
  .[["cluster"]] %>%
  # paste0() is vectorised and inserts no separator, so labels read
  # "Cluster_1" instead of the "Cluster_ 1" that paste() produced.
  paste0("Cluster_", .) %>%
  # Build the tibble directly; as_tibble() on a bare vector is discouraged.
  tibble::tibble(Dbscan_Clusters = .)

Hierarchical DBSCAN

library(dbscan)
# Hierarchical DBSCAN with minPts = 5. The raw data matrix is the primary
# input and the fused distance is supplied through `xdist`. The result is a
# one-column tibble (`Hdbscan_Clusters`) with one label per observation.
# NOTE(review): `xdist` may not exist in newer dbscan releases, where a
# `dist` object is passed as `x` instead -- confirm against the installed
# version.
hdbscan_clusters <- cell_cycle_data %>%
  dbscan::hdbscan(
    minPts = 5,
    xdist = as.dist(cell_cycle_for_clustering),
    gen_hdbscan_tree = FALSE,
    gen_simplified_tree = FALSE
  ) %>%
  .[["cluster"]] %>%
  # paste0() is vectorised and inserts no separator, so labels read
  # "Cluster_1" instead of the "Cluster_ 1" that paste() produced.
  paste0("Cluster_", .) %>%
  # Build the tibble directly; as_tibble() on a bare vector is discouraged.
  tibble::tibble(Hdbscan_Clusters = .)

DIvisive ANAlysis Clustering

library(cluster)
# Divisive hierarchical clustering (DIANA) of the fused distance, cut into
# three groups. The result is a one-column tibble (`Diana_Clusters`) with one
# label per observation, e.g. "Cluster_1".
diana_clusters <- cell_cycle_for_clustering %>%
  as.dist() %>%
  cluster::diana(
    diss = TRUE,
    metric = NULL,
    stand = FALSE,
    stop.at.k = FALSE,
    keep.diss = FALSE,
    keep.data = FALSE,
    trace.lev = 0
  ) %>%
  # Convert to a proper hclust tree before cutting, as the cluster package
  # documentation does, instead of handing the diana object to cutree().
  stats::as.hclust() %>%
  stats::cutree(k = 3, h = NULL) %>%
  # paste0() is vectorised and inserts no separator, so labels read
  # "Cluster_1" instead of the "Cluster_ 1" that paste() produced.
  paste0("Cluster_", .) %>%
  # Build the tibble directly; as_tibble() on a bare vector is discouraged.
  tibble::tibble(Diana_Clusters = .)

Partitioning Around Medoids

library(cluster)
# Partitioning Around Medoids with k = 3 on the fused distance. Because
# `cluster.only = TRUE`, pam() returns just the cluster assignment vector.
# The result is a one-column tibble (`Pam_Clusters`) with one label per
# observation, e.g. "Cluster_1".
pam_clusters <- cell_cycle_for_clustering %>%
  as.dist() %>%
  cluster::pam(
    k = 3,
    diss = TRUE,
    metric = NULL,
    medoids = NULL,
    stand = FALSE,
    cluster.only = TRUE,
    do.swap = TRUE,
    keep.diss = FALSE,
    keep.data = FALSE,
    pamonce = FALSE,
    trace.lev = 0
  ) %>%
  # paste0() is vectorised and inserts no separator, so labels read
  # "Cluster_1" instead of the "Cluster_ 1" that paste() produced.
  paste0("Cluster_", .) %>%
  # Build the tibble directly; as_tibble() on a bare vector is discouraged.
  tibble::tibble(Pam_Clusters = .)

Affinity Propagation

library(apcluster)
# Affinity propagation needs similarities, so the fused distance is first
# converted with proxy::pr_dist2simil().
ap_clusters <- cell_cycle_for_clustering %>%
  proxy::pr_dist2simil() %>%
  apcluster::apcluster(
    p = NA,
    q = NA,
    maxits = 1000,
    convits = 100,
    lam = 0.9,
    includeSim = FALSE,
    details = FALSE,
    nonoise = FALSE,
    seed = NA
  )
# Turn the `idx` slot of the result into a one-column tibble of labels.
# paste0() is vectorised and inserts no separator, so labels read
# "Cluster_17" instead of the "Cluster_ 17" that paste() produced; the tibble
# is built directly because as_tibble() on a bare vector is discouraged.
# NOTE(review): `idx` appears to hold each sample's exemplar index, so the
# label numbers are not consecutive 1..k -- confirm.
ap_clusters <- tibble::tibble(
  AP_Clusters = paste0("Cluster_", ap_clusters@idx)
)

Affinity Propagation for Pre-defined Number of Clusters

library(apcluster)
# Affinity propagation with the number of clusters fixed to K = 3. It needs
# similarities, so the fused distance is first converted with
# proxy::pr_dist2simil().
apk_clusters <- cell_cycle_for_clustering %>%
  proxy::pr_dist2simil() %>%
  apcluster::apclusterK(
    K = 3,
    prc = 10,
    bimaxit = 20,
    exact = FALSE,
    maxits = 1000,
    convits = 100,
    lam = 0.9,
    includeSim = FALSE,
    details = FALSE,
    nonoise = FALSE,
    seed = NA,
    verbose = FALSE
  )
# Turn the `idx` slot of the result into a one-column tibble of labels.
# paste0() is vectorised and inserts no separator, so labels read
# "Cluster_17" instead of the "Cluster_ 17" that paste() produced; the tibble
# is built directly because as_tibble() on a bare vector is discouraged.
# NOTE(review): `idx` appears to hold each sample's exemplar index, so the
# label numbers are not consecutive 1..k -- confirm.
apk_clusters <- tibble::tibble(
  APk_Clusters = paste0("Cluster_", apk_clusters@idx)
)

Spectral Clustering

library(SNFtool)
# Spectral clustering into K = 3 groups. It works on similarities, so the
# fused distance is first converted with proxy::pr_dist2simil(). The result
# is a one-column tibble (`Spectral_Clusters`) with one label per
# observation, e.g. "Cluster_1".
spectral_clusters <- cell_cycle_for_clustering %>%
  proxy::pr_dist2simil() %>%
  SNFtool::spectralClustering(
    K = 3,
    type = 3
  ) %>%
  # paste0() is vectorised and inserts no separator, so labels read
  # "Cluster_1" instead of the "Cluster_ 1" that paste() produced.
  paste0("Cluster_", .) %>%
  # Build the tibble directly; as_tibble() on a bare vector is discouraged.
  tibble::tibble(Spectral_Clusters = .)

Brian Muchmore

2018-09-26