\name{cqcluster.stats}
\alias{cqcluster.stats}
\alias{summary.cquality}
\alias{print.summary.cquality}
%- Also NEED an `\alias' for EACH other topic documented here.
\title{Cluster validation statistics (version for use with clusterbenchstats)}
\description{
  This is a more sophisticated version of \code{\link{cluster.stats}}
  for use with \code{\link{clusterbenchstats}}, see Hennig (2019).
  Computes a number of distance-based statistics, which can be used for cluster
  validation, comparison between clusterings and decision about
  the number of clusters: cluster sizes, cluster diameters,
  average distances within and between clusters, cluster separation,
  biggest within cluster gap, 
  average silhouette widths, the Calinski and Harabasz index,
  a Pearson version of
  Hubert's gamma coefficient, the Dunn index, further statistics
  introduced
  in Hennig (2019) and two indexes
  to assess the similarity of two clusterings, namely the corrected Rand
  index and Meila's VI.
}
\usage{
cqcluster.stats(d = NULL, clustering, alt.clustering = NULL,
                             noisecluster = FALSE, 
    silhouette = TRUE, G2 = FALSE, G3 = FALSE, wgap = TRUE, sepindex = TRUE, 
    sepprob = 0.1, sepwithnoise = TRUE, compareonly = FALSE, 
    aggregateonly = FALSE, 
    averagegap=FALSE, pamcrit=TRUE,
    dquantile=0.1,
    nndist=TRUE, nnk=2, standardisation="max", sepall=TRUE, maxk=10,
    cvstan=sqrt(length(clustering)))

\method{summary}{cquality}(object,stanbound=TRUE,largeisgood=TRUE, ...)

\method{print}{summary.cquality}(x, ...)

			      
}
%- maybe also `usage' for other objects documented here.
\arguments{
  \item{d}{a distance object (as generated by \code{dist}) or a distance
    matrix between cases.}
  \item{clustering}{an integer vector of length equal to the number of cases,
    which indicates a clustering. The clusters have to be numbered
    from 1 to the number of clusters.}
  \item{alt.clustering}{an integer vector such as for
    \code{clustering}, indicating an alternative clustering. If provided, the
    corrected Rand index and Meila's VI for \code{clustering}
    vs. \code{alt.clustering} are computed.}
  \item{noisecluster}{logical. If \code{TRUE}, it is assumed that the
    largest cluster number in \code{clustering} denotes a 'noise
    class', i.e. points that do not belong to any cluster. These points
    are not taken into account for the computation of all functions of
    within and between cluster distances including the validation
    indexes.} 
  \item{silhouette}{logical. If \code{TRUE}, the silhouette statistics
    are computed, which requires package \code{cluster}.}
  \item{G2}{logical. If \code{TRUE}, Goodman and Kruskal's index G2
    (cf. Gordon (1999), p. 62) is computed. This executes lots of
    sorting algorithms and can be very slow (it has been improved
    by R. Francois - thanks!)}
  \item{G3}{logical. If \code{TRUE}, the index G3
    (cf. Gordon (1999), p. 62) is computed. This executes \code{sort}
    on all distances and can be extremely slow.}
  \item{wgap}{logical. If \code{TRUE}, the widest within-cluster gaps
    (largest link in within-cluster minimum spanning tree) are
    computed. This is used for finding a good number of clusters in
    Hennig (2013). See also parameter \code{averagegap}.}
  \item{sepindex}{logical. If \code{TRUE}, a separation index is
    computed, defined based on the distances for every point to the
    closest point not in the same cluster. The separation index is then
    the mean of the smallest proportion \code{sepprob} of these. This
    makes the formalisation of separation less sensitive to a single or a few
    ambiguous points. The output component corresponding to this is
    \code{sindex}, not \code{separation}! This is used for finding a
    good number of clusters in Hennig (2013). See also parameter
    \code{sepall}.}
  \item{sepprob}{numerical between 0 and 1, see \code{sepindex}.}
  \item{sepwithnoise}{logical. If \code{TRUE} and \code{sepindex} and
    \code{noisecluster} are both \code{TRUE}, the noise points are
    incorporated as cluster in the separation index (\code{sepindex})
    computation. Also
    they are taken into account for the computation for the minimum
    cluster separation.} 
  \item{compareonly}{logical. If \code{TRUE}, only the corrected Rand index
    and Meila's VI are
    computed and given out (this requires \code{alt.clustering} to be
    specified).}
  \item{aggregateonly}{logical. If \code{TRUE} (and not
    \code{compareonly}), no clusterwise but only aggregated information
    is given out (this cuts the size of the output down a bit).}
  \item{averagegap}{logical. If \code{TRUE}, the average of the widest
      within-cluster gaps over all clusters is given out; if
      \code{FALSE}, the maximum is given out.}
  \item{pamcrit}{logical. If \code{TRUE}, the average distance of points
    to their respective cluster centroids is computed (criterion of the
    PAM clustering method); centroids are chosen so that they minimise
    this criterion for the given clustering.}
  \item{dquantile}{numerical between 0 and 1; quantile used for kernel
    density estimator for density indexes, see Hennig (2019), Sec. 3.6.}
  \item{nndist}{logical. If \code{TRUE}, average distance to \code{nnk}th
    nearest neighbour within cluster is computed.}
  \item{nnk}{integer. Number of neighbours used in average and
    coefficient of
    variation of distance to nearest within cluster neighbour (clusters
    with \code{nnk} or fewer points are ignored for this).}
  \item{standardisation}{\code{"none"}, \code{"max"}, \code{"ave"},
    \code{"q90"}, or a number. See details.}
  \item{sepall}{logical. If \code{TRUE}, a fraction of smallest
    \code{sepprob} distances to other clusters is used from every
    cluster. Otherwise, a fraction of smallest \code{sepprob} distances
    overall is used in the computation of \code{sindex}.}
  \item{maxk}{numeric. Parsimony is defined as the number of clusters
    divided by \code{maxk}.}
  \item{cvstan}{numeric. \code{cvnnd} is standardised by \code{cvstan}
    if there is standardisation, see Details.}
  \item{object}{object of class \code{cquality}, output of \code{cqcluster.stats}.}
  \item{x}{object of class \code{summary.cquality}, output of \code{summary.cquality}.}
  \item{stanbound}{logical. If \code{TRUE}, all index values larger than
    1 will be set to 1, and all values smaller than 0 will be set to 0.
    This is for preparation in case of \code{largeisgood=TRUE} (if
    values are already suitably standardised within
    \code{cqcluster.stats}, it won't do harm and can do good).}
  \item{largeisgood}{logical. If \code{TRUE}, indexes \code{x} are
    transformed to \code{1-x} in case that before transformation smaller
    values indicate a better clustering (that's \code{average.within,
      mnnd, widestgap, within.cluster.ss, dindex, denscut, pamc,
      max.diameter, highdgap, cvnnd}). For this to make sense,
    \code{cqcluster.stats} should be run with
    \code{standardisation="max"} and \code{summary.cquality} with
    \code{stanbound=TRUE}.} 
  \item{...}{no effect.}
}

\details{
  The \code{standardisation}-parameter governs the standardisation of
  the index values.
  \code{standardisation="none"} means that unstandardised
  raw values of indexes are given out. Otherwise, \code{entropy} will be
  standardised by the
  maximum possible value for the given number of clusters;
  \code{within.cluster.ss} and \code{between.cluster.ss} will be
  standardised by the overall sum of squares; \code{mnnd} will be
  standardised by the maximum distance to the \code{nnk}th nearest
  neighbour within cluster; \code{pearsongamma} will be standardised
  by adding 1 and dividing by 2; \code{cvnnd} will be standardised by
  \code{cvstan} (the default is the possible maximum).

  \code{standardisation} allows options for the standardisation of
  \code{average.within, sindex, widestgap, pamc, max.diameter,
  min.separation} and can be \code{"max"} (maximum distance),
  \code{"ave"} (average distance), \code{"q90"} (0.9-quantile of
  distances), or a positive number. \code{"max"} is the default and
  standardises all the listed indexes into the range [0,1].}


\note{
  Because \code{cqcluster.stats} processes a full dissimilarity matrix, it
  isn't suitable for large data sets. You may consider
  \code{\link{distcritmulti}} in that case.
}

\value{
  \code{cqcluster.stats} with \code{compareonly=FALSE} and
  \code{aggregateonly=FALSE} returns a list of type
  \code{cquality} containing the components
  \code{n, cluster.number, cluster.size,  min.cluster.size, noisen,
    diameter,
    average.distance, median.distance, separation, average.toother,
    separation.matrix, ave.between.matrix, average.between, average.within,
    n.between, n.within, max.diameter, min.separation,
    within.cluster.ss, clus.avg.silwidths, avg.silwidth,
    g2, g3, pearsongamma, dunn, dunn2, entropy, wb.ratio, ch, cwidegap,
    widestgap, corrected.rand, vi, sindex, svec, psep, stan, nnk, mnnd,
    pamc, pamcentroids, dindex, denscut, highdgap, npenalty, dpenalty,
    withindensp, densoc, pdistto, pclosetomode, distto, percwdens,
    percdensoc, parsimony, cvnnd, cvnndc}. Some of these are
    standardised, see Details. If
    \code{compareonly=TRUE}, only \code{corrected.rand, vi} are given
    out. If \code{aggregateonly=TRUE}, only \code{n, cluster.number,
    min.cluster.size, noisen, diameter,
    average.between, average.within,
    max.diameter, min.separation,
    within.cluster.ss, avg.silwidth,
    g2, g3, pearsongamma, dunn, dunn2, entropy, wb.ratio, ch, 
    widestgap, corrected.rand, vi, sindex, svec, psep, stan, nnk, mnnd,
    pamc, pamcentroids, dindex, denscut, highdgap, parsimony, cvnnd,
    cvnndc} are given out.
  
  \code{summary.cquality} returns a list of type \code{summary.cquality}
  with components \code{average.within,nnk,mnnd,
              avg.silwidth,
              widestgap,sindex,
              pearsongamma,entropy,pamc,
              within.cluster.ss,
              dindex,denscut,highdgap,
              parsimony,max.diameter,
              min.separation,cvnnd}. These are as documented below for
  \code{cqcluster.stats}, but after transformation by \code{stanbound}
  and \code{largeisgood}, see arguments.

  \item{n}{number of points.}
  \item{cluster.number}{number of clusters.}
  \item{cluster.size}{vector of cluster sizes (number of points).}
  \item{min.cluster.size}{size of smallest cluster.}
  \item{noisen}{number of noise points, see argument \code{noisecluster}
    (\code{noisen=0} if \code{noisecluster=FALSE}).}
  \item{diameter}{vector of cluster diameters (maximum within cluster
    distances).}
  \item{average.distance}{vector of clusterwise
    within cluster average distances.}
  \item{median.distance}{vector of clusterwise
    within cluster distance medians.}
  \item{separation}{vector of clusterwise minimum distances of a point
    in the cluster to a point of another cluster.}
  \item{average.toother}{vector of clusterwise average distances of a point
    in the cluster to the points of other clusters.}
  \item{separation.matrix}{matrix of separation values between all pairs
    of clusters.}
  \item{ave.between.matrix}{matrix of mean dissimilarities between
  points of every pair of clusters.}
  \item{average.between}{average distance between clusters.}
  \item{average.within}{average distance within clusters (reweighted so
  that every observation, rather than every distance, has the same weight).}
  \item{n.between}{number of distances between clusters.}
  \item{n.within}{number of distances within clusters.}
  \item{max.diameter}{maximum cluster diameter.}
  \item{min.separation}{minimum cluster separation.}
  \item{within.cluster.ss}{a generalisation of the within clusters sum
    of squares (k-means objective function), which is obtained if
    \code{d} is a Euclidean distance matrix.  For general distance
    measures, this is half
    the sum of the within cluster squared dissimilarities divided by the
    cluster size.}
  \item{clus.avg.silwidths}{vector of cluster average silhouette
    widths. See
    \code{\link[cluster]{silhouette}}.}
  \item{avg.silwidth}{average silhouette
    width. See \code{\link[cluster]{silhouette}}.}
  \item{g2}{Goodman and Kruskal's Gamma coefficient. See Milligan and
    Cooper (1985), Gordon (1999, p. 62).}
  \item{g3}{G3 coefficient. See Gordon (1999, p. 62).}
  \item{pearsongamma}{correlation between distances and a
    0-1-vector where 0 means same cluster, 1 means different clusters.
    "Normalized gamma" in Halkidi et al. (2001).}
  \item{dunn}{minimum separation / maximum diameter. Dunn index, see
    Halkidi et al. (2001).}
  \item{dunn2}{minimum average dissimilarity between two clusters /
    maximum average within cluster dissimilarity, another version of
    the family of Dunn indexes.} 
  \item{entropy}{entropy of the distribution of cluster memberships,
    see Meila (2007).}
  \item{wb.ratio}{\code{average.within/average.between}.}
  \item{ch}{Calinski and Harabasz index (Calinski and Harabasz 1974,
    optimal in Milligan and Cooper 1985; generalised for dissimilarities
    in Hennig and Liao 2013).}
  \item{cwidegap}{vector of widest within-cluster gaps.}
  \item{widestgap}{widest within-cluster gap or average of cluster-wise
    widest within-cluster gap, depending on parameter \code{averagegap}.}
  \item{corrected.rand}{corrected Rand index (if \code{alt.clustering}
    has been specified), see Gordon (1999, p. 198).}
  \item{vi}{variation of information (VI) index (if \code{alt.clustering}
    has been specified), see Meila (2007).}
  \item{sindex}{separation index, see argument \code{sepindex}.}
  \item{svec}{vector of smallest closest distances of points to next
    cluster that are used in the computation of \code{sindex} if
    \code{sepall=TRUE}.}
  \item{psep}{vector of all closest distances of points to next
      cluster.}
  \item{stan}{value by which some statistics were standardised, see
    Details.}
  \item{nnk}{value of input parameter \code{nnk}.}
  \item{mnnd}{average distance to \code{nnk}th nearest neighbour within
    cluster.}
  \item{pamc}{average distance to cluster centroid.}
  \item{pamcentroids}{index numbers of cluster centroids.}
  \item{dindex}{this index measures to what extent the density decreases
    from the cluster mode to the outskirts; I-densdec in Sec. 3.6 of
    Hennig (2019); low values are good.}
  \item{denscut}{this index measures whether cluster boundaries run
    through density valleys; I-densbound in Sec. 3.6 of Hennig (2019); low
    values are good.}
  \item{highdgap}{this measures whether there is a large within-cluster
    gap with high density on both sides; I-highdgap in Sec. 3.6 of
    Hennig (2019); low values are good.}
  \item{npenalty}{vector of penalties for all clusters that are used
    in the computation of \code{denscut}, see Hennig (2019) (these are
    sums of penalties over all points in the cluster).}
  \item{dpenalty}{vector of penalties for all clusters that are used in
    the computation of \code{dindex}, see Hennig (2019) (these are
    sums of several penalties for density increase when going from the
    mode outward in the cluster).}
  \item{withindensp}{distance-based kernel density values for all points
    as computed in Sec. 3.6 of Hennig (2019).}
  \item{densoc}{contribution of points from other clusters than the one
    to which a point is assigned to the density, for all points; called
    \code{h_o} in Sec. 3.6 of Hennig (2019).}
  \item{pdistto}{list that for all clusters has a sequence of point
    numbers. These are the points already incorporated in the sequence
    of points constructed in the algorithm in Sec. 3.6 of Hennig (2019) to
    which the next point to be joined is connected.}
  \item{pclosetomode}{list that for all clusters has a sequence of point
    numbers. Sequence of points to be incorporated in the sequence
    of points constructed in the algorithm in Sec. 3.6 of Hennig
    (2019).}
  \item{distto}{list that for all clusters has a sequence of differences
    between the standardised densities (see \code{percwdens}) at the new
    point added and the point to which
    it is connected (if this is positive, the penalty is this to the
    square), in the algorithm in Sec. 3.6 of Hennig (2019).}
  \item{percwdens}{this is \code{withindensp} divided by its maximum.}
  \item{percdensoc}{this is \code{densoc} divided by the maximum of
   \code{withindensp}, called \code{h_o^*} in Sec. 3.6 of Hennig (2019).}
 \item{parsimony}{number of clusters divided by \code{maxk}.}
 \item{cvnnd}{coefficient of variation of dissimilarities to
   \code{nnk}th nearest within-cluster neighbour, measuring uniformity of
   within-cluster densities, weighted over all clusters, see Sec. 3.7 of
   Hennig (2019).}
 \item{cvnndc}{vector of cluster-wise coefficients of variation of
   dissimilarities to \code{nnk}th nearest within-cluster neighbour as
  required in computation of \code{cvnnd}.}
}
\references{
Akhanli, S. and Hennig, C. (2020) Calibrating and aggregating cluster
validity indexes for context-adapted comparison of clusterings.
\emph{Statistics and Computing}, 30, 1523-1544,
\url{https://link.springer.com/article/10.1007/s11222-020-09958-2}, \url{https://arxiv.org/abs/2002.01822}

    Calinski, T., and Harabasz, J. (1974) A Dendrite Method for Cluster 
Analysis, \emph{Communications in Statistics}, 3, 1-27.

  Gordon, A. D. (1999) \emph{Classification}, 2nd ed. Chapman and Hall.

  Halkidi, M., Batistakis, Y., Vazirgiannis, M. (2001) On Clustering
  Validation Techniques, \emph{Journal of Intelligent Information
    Systems}, 17, 107-145.

  Hennig, C. and Liao, T. (2013) How to find an appropriate clustering
  for mixed-type variables with application to socio-economic
  stratification, \emph{Journal of the Royal Statistical Society, Series
  C Applied Statistics}, 62, 309-369.

  Hennig, C. (2013) How many bee species? A case study in
determining the number of clusters. In: M. Spiliopoulou, L. Schmidt-Thieme,
R. Janning (eds.):
"Data Analysis, Machine Learning and Knowledge Discovery", Springer,
Berlin, 41-49.

Hennig, C. (2019) Cluster validation by measurement of clustering
  characteristics relevant to the user. In C. H. Skiadas (ed.)
  \emph{Data Analysis and Applications 1: Clustering and Regression,
  Modeling-estimating, Forecasting and Data Mining, Volume 2}, Wiley,
  New York, 1-24,
    \url{https://arxiv.org/abs/1703.09282}

    Kaufman, L. and Rousseeuw, P.J. (1990). "Finding Groups in Data:
  An Introduction to Cluster Analysis". Wiley, New York.
  
  Meila, M. (2007) Comparing clusterings---an information based distance,
  \emph{Journal of Multivariate Analysis}, 98, 873-895.
  
  Milligan, G. W. and Cooper, M. C. (1985) An examination of procedures
  for determining the number of clusters. \emph{Psychometrika}, 50, 159-179.
}
\author{Christian Hennig
  \email{christian.hennig@unibo.it}
  \url{https://www.unibo.it/sitoweb/christian.hennig/en/}
}
\seealso{
  \code{\link{cluster.stats}},
  \code{\link[cluster]{silhouette}}, \code{\link{dist}}, \code{\link{calinhara}},
  \code{\link{distcritmulti}}.
  \code{\link{clusterboot}} computes clusterwise stability statistics by
  resampling.
}
\examples{  
  set.seed(20000)
  options(digits=3)
  face <- rFace(200,dMoNo=2,dNoEy=0,p=2)
  dface <- dist(face)
  complete3 <- cutree(hclust(dface),3)
  cqcluster.stats(dface,complete3,
                alt.clustering=as.integer(attr(face,"grouping")))
  
}
\keyword{cluster}% at least one, from doc/KEYWORDS
\keyword{multivariate}



