*------------------------------------------------------------------------
* The Common Algorithm for data clustering
* by Andy Allinger, 2011-2017, released to the public domain
* This program may be used by any person for any purpose.
*
* Refer to:
*   Robust Cluster Analysis Via Mixtures Of Multivariate t-Distributions
*   Geoffrey J. McLachlan and Donald Peel
*   University of Queensland
*
* and:
*   Roubust mixture modelling using the t distribution
*   D. Peel and G. J. McLachlan
*   Statistics and Computing v.10 pp.339-348, 2000.
*------------------------------------------------------------------------

*-----------------------------------------------------------------------
* dtpfc - distance to point from center
*
*___Name__________Type_______________In/Out___Description_______________
*   X(P)          Real               In       Datum vector
*   P             Integer            In       Length of X
*   T(P)          Real               In       Period of modulo arithmetic
*   CAT(P)        Integer            In       # categories each variable
*   CATTOT        Integer            In       Total number of categories
*   LIMCON        Integer            In       # continuous variables
*   LIMCAT        Integer            In       # categorical variables
*   C(P)          Real               In       Centroid vector
*   TAL(CATTOT)   Real               In       Count of categorical data
*   POP           Real               In       Cluster population
*   LIT           Real               In       Negligible value
*   DS(LIMCON)    Double Precision   Out      Difference, scalars
*   DT(LIMCAT)    Double Precision   Out      Difference, categoricals
*-----------------------------------------------------------------------
      SUBROUTINE DTPFC (X, P, T, CAT, CATTOT, LIMCON, LIMCAT, 
     $                    C, TAL, POP, LIT, DS, DT)
       IMPLICIT NONE
       INTEGER P, CAT, CATTOT, LIMCON, LIMCAT
       REAL X, T, C, TAL, POP, LIT
       DOUBLE PRECISION DS, DT
       DIMENSION X(P), T(P), CAT(P), C(P), TAL(CATTOT),
     $             DS(LIMCON), DT(LIMCAT)

*            local variables
       INTEGER H, HCUM, J, JS, JT
       DOUBLE PRECISION DIF, HT
       
*-----------------------------------------------------------------------
*          begin
*-----------------------------------------------------------------------
       HCUM = 0
       JS = 0
       JT = 0
       DO J = 1, P
         IF (CAT(J) .EQ. 0) THEN
           JS = JS + 1
           IF (T(J) < LIT) THEN                ! scalar
             DS(JS) = X(J) - C(J)
           ELSE                                ! modulo
             HT = 0.5 * T(J)
             DIF = MOD(X(J) - C(J), T(J))
             IF (DIF < -HT) THEN
               DIF = DIF + T(J)
             ELSE IF (DIF > HT) THEN
               DIF = DIF - T(J)
             END IF
             DS(JS) = DIF
           END IF
         ELSE
           JT = JT + 1
           H = HCUM + NINT(X(J))
           IF (T(J) > -2.5) THEN             ! categorical
             DIF = (TAL(H) + 1.) / (POP + FLOAT(CAT(J)))  ! Laplace smoothing
           ELSE                              ! contra-categorical
             DIF = (POP - TAL(H) + 1.) / 
     $               ((POP + 1.) * FLOAT(CAT(J)) - POP)
           END IF
           DT(JT) = DIF
           HCUM = HCUM + CAT(J)
         END IF ! cat > 0 ?
       END DO ! next j
       RETURN
      END ! of DTPFC


************************************************************************
* tclust - Mixture decomposition with the Student t distribution
*
*___Name_______________Type______I/O____Description_____________________
*   X(PLIM,N)          Real      in     data points    
*   PLIM               Integer   in     row dimension of x
*   P                  Integer   in     dimension of the data  
*   N                  Integer   in     number of points   
*   T(P)               Real      in     period of modulo arithmetic
*   CAT(P)             Integer   in     # categories each variable 
*   CATTOT             Integer   in     total # categories
*   M                  Integer   in     number of input sources
*   C(PLIM,KMAX)       Real      out    cluster centers
*   KMIN               Integer   in     fewest clusters to make
*   K                  Integer   both   number of clusters to make
*   KMAX               Integer   in     most clusters to make
*   TAL(CATTOT,KMAX)   Real      work   category counts each cluster
*   Z(N)               Integer   out    maximal cluster membership each point
*   POP(KMAX)          Real      work   cluster population
*   ROBUST             Logical   in     use weighted median
*   IND(N)             Integer   in     permutation vector
*   WORK(N)            Real      work   temp space of X
*   QW(N)              Real      work   weights of weighted averages
*   AVE(N)             Real      work   result array for WCYMEN
*   ITER               Integer   in     quit after this many iterations
*   IERR               Integer   out    error code
************************************************************************
      SUBROUTINE TCLUST (X, PLIM, P, N, T, CAT, CATTOT, M, C, KMIN, K, 
     $  KMAX, TAL, Z, POP, ROBUST, IND, WORK, QW, AVE, ITER, IERR)

       IMPLICIT NONE
       INTEGER PLIM, P, N, CAT, CATTOT, M, KMIN, K, KMAX, Z, IND, ITER, 
     $  IERR
       REAL X, T, C, TAL, POP, WORK, QW, AVE
       LOGICAL ROBUST
       DIMENSION X(PLIM,N), T(P), CAT(P), C(PLIM,KMAX), 
     $  TAL(CATTOT,KMAX), Z(N), POP(KMAX), 
     $  IND(N), WORK(N), QW(N), AVE(N)
       
************************************************************************
*         Constants.  
************************************************************************
       INTEGER MAXIT, NP
       REAL VIS0, VIS9
       DOUBLE PRECISION PI, TOLPAR
       LOGICAL dump
       PARAMETER (
     $            dump = .true.,              ! turn on ticker tape
     $            MAXIT = 500,                 ! max iterations
     $            NP = 18,                     ! length of primes array
     $            PI = 3.141592653589793D0,    ! factor in Student pdf
     $            TOLPAR = 1D-4,               ! set convergence tolerance
     $            VIS0 = 0.001,                ! initial viscosity
     $            VIS9 = 0.500)                ! final viscosity
       
*               local variables
       INTEGER 
     $         G, H, i, J, J1, J2, K0, L,    ! counters
     $         HCUM,                         ! offset into TAL
     $         IALL,                         ! allocation error #
     $         KP,                           ! which prime constant
     $         LIMCAT, LIMCON,               ! array bounds
     $         PCAT, PCON,                   ! # categorical and continuous
     $         PRIME(NP),                    ! for shuffling
     $         RANK                          ! rank of correlation matrix
     
       REAL 
     $      ETA,                     ! numerical viscosity
     $      SEPS, SLIT,              ! machine constants
     $      Y,                       ! temp scalar
     $      VISPOW                   ! viscosity update exponent
     
       DOUBLE PRECISION
     $                  CTERM,       ! term in Student pdf
     $                  D,           ! temp scalar
     $                  DEPS, DLIT,  ! double precision machine constants
     $                  LAD,         ! log(abs(det(R)))
     $                  NU,          ! degrees of freedom of Student dist
     $                  NUPP,        ! nu+p
!     $                  ODK,         ! 1/k
     $                  ODN,         ! 1/n
     $                  QT,          ! sum of Q over L
     $                  TOL,         ! convergence tolerance
     $                  TOT          ! a total
     
*              allocatable arrays
       REAL, DIMENSION (:,:), ALLOCATABLE :: Q, QOLD, W
       DOUBLE PRECISION, DIMENSION (:), ALLOCATABLE :: DQ, DS1, DS2, DT,
     $                                                  DEV, S
       DOUBLE PRECISION, DIMENSION (:,:), ALLOCATABLE :: R, U, V
       
*              external functions
       INTEGER HAT                 ! draw random #
       REAL RMACHINE
       DOUBLE PRECISION DMACHINE !!,
       LOGICAL DSAFDIV             ! safe to divide function
 
       SAVE PRIME
       DATA PRIME / 907, 911, 919, 929, 937, 941, 947, 953, 967, 
     $  971, 977, 983, 991, 997, 1009, 1013, 1019, 1021 /
       
************************************************************************
*                      begin       
************************************************************************
       IERR = 0 
       SEPS = RMACHINE(1)
       SLIT = RMACHINE(2)
       DEPS = DMACHINE(1)
       DLIT = DMACHINE(2)
       KP = HAT(NP)
       
       IF (K .LE. 1 .OR. K .GE. N) THEN   ! validate
         IERR = 3
         RETURN
       END IF
       
*          count categories
       PCON = 0
       PCAT = 0
       DO J = 1, P
         IF (CAT(J) .EQ. 0) THEN
           PCON = PCON + 1
         ELSE
           PCAT = PCAT + 1
         END IF
       END DO
       LIMCON = MAX(PCON, 1)
       LIMCAT = MAX(PCAT, 1)
       
*          request workspace
       ALLOCATE (DEV(LIMCON), DS1(LIMCON), DS2(LIMCON), DT(LIMCAT), 
     $  Q(N,KMAX), QOLD(N,KMAX), W(N,KMAX), DQ(KMAX), S(LIMCON), 
     $  R(LIMCON,LIMCON), U(LIMCON,LIMCON), V(LIMCON,LIMCON), STAT=IALL)
       IF (IALL .NE. 0) THEN
         PRINT *, 'tclust: trouble getting memory'
         IERR = 7
         RETURN
       END IF
       
       NU = DBLE(MAX(1, M - 1))               ! small degree of freedom
!!       NU = DBLE(MAX(1, N - KMAX - PCON))  ! large degree of freedom
       NUPP = NU + PCON
       ODN = 1. / DBLE(N)
       CTERM = ALGAMA(NUPP / 2.D0) - ALGAMA(NU / 2.D0) -
     $             0.5D0 * PCON * LOG(PI * NU)
!       TOL = TOLPAR * DBLE(N)
       TOL = 3 * N * KMAX * SEPS
       PRINT *, 'using tol of ', TOL
       ETA = VIS0
       VISPOW = (LOG(VIS9) / LOG(VIS0)) **(1. / FLOAT(MAXIT - 1))

*        set Q from initial partition
       DO i = 1, N
         L = Z(i)
         DO J = 1, K
           IF (J .EQ. L) THEN
             Q(i,J) = 1.
           ELSE
             Q(i,J) = 0.
           END IF
         END DO ! next j
       END DO ! next i
       
*        Uniform initial W
       DO i = 1, N
         DO L = 1, K
           W(i,L) = 1.
         END DO
       END DO
       
************************************************************************
*          main loop 
************************************************************************
       DO G = 1, MAXIT
       
*             quit?
         IF (G > ITER) GO TO 90       
         
*            populations
         DO L = 1, K
           CALL ADD (Q(1,L), N, POP(L))
         END DO
         
************************************************************************
*             delete empty clusters
************************************************************************
         K0 = K
         DO L = K0, 1, -1
           IF (POP(L) < SEPS) THEN
             IF (K .LE. KMIN) THEN
               DEALLOCATE (DEV, DS1, DS2, DT, Q, QOLD, W,
     $             DQ, S, R, U, V, STAT=IALL)
               IERR = 1
               RETURN
             END IF  ! too few?
             DO J = 1, P
               C(J,L) = C(J,K)
             END DO
             DO H = 1, CATTOT
               TAL(H,L) = TAL(H,K)
             END DO
             DO i = 1, N
               Q(i,L) = Q(i,K)
               QOLD(i,L) = QOLD(i,K)
               W(i,L) = W(i,K)
             END DO
             POP(L) = POP(K)
             POP(K) = 0.
             K = K - 1
           END IF  ! empty?
         END DO  ! next L
                   
************************************************************************
*          let each center be a weighted average
************************************************************************
         DO L = 1, K
           DO H = 1, CATTOT            ! clear tally
             TAL(H,L) = 0.
           END DO
           HCUM = 0
           DO J = 1, P
             IF (CAT(J) .EQ. 0) THEN      ! continous and cyclic
               DO i = 1, N
                 WORK(i) = X(J,i)         ! data
                 QW(i) = Q(i,L) * W(i,L)  ! weight
               END DO
         
               IF (T(J) > SLIT) THEN        ! modulo arithmetic
                 CALL WCYMEN (WORK, QW, IND, N, T(J), AVE, i)
                 IF (i .EQ. 0) THEN
                   PRINT *, 'WCYMEN returns null answer!'
                   PRINT *, 'work: ', WORK
                   PRINT *, 'qw: ', QW
                   PRINT *, 'n: ', N
                   PRINT *, 't(j): ', T(J)
                   STOP
                 END IF
                 i = MOD(PRIME(KP) * L, i) + 1
                 C(J,L) = AVE(i) 
               ELSE IF (ROBUST) THEN ! scalar data
                 CALL WMED (WORK, QW, IND, N, C(J,L))
               ELSE                             ! weighted mean
                 CALL WMEAN (WORK, QW, N, C(J,L))
               END IF   ! cyclic ?
               
             ELSE
               C(J,L) = 0.
               DO i = 1, N
                 H = HCUM + NINT(X(J,i))
                 TAL(H,L) = TAL(H,L) + Q(i,L)
               END DO
               HCUM = HCUM + CAT(J)
             END IF ! continuous or categorical ?
             
           END DO     ! next j
         END DO    ! next L

*                  skip ahead if all categorical variables
         LAD = 0.D0
         IF (PCON < 1) GO TO 50

************************************************************************
*          deviations of continuous variables
************************************************************************
         DO J = 1, PCON
           DEV(J) = 0.D0
         END DO
         
         DO i = 1, N
           DO L = 1, K
             CALL DTPFC (X(1,i), P, T, CAT, CATTOT, LIMCON, LIMCAT, 
     $                    C(1,L), TAL(1,L), POP(L), SLIT, DS1, DT)
             DO J = 1, PCON
               DEV(J) = DEV(J) + Q(i,L) * DS1(J) **2
             END DO   ! next dimension
           END DO  ! next center
         END DO  ! next object
         
         DO J = 1, PCON
           DEV(J) = SQRT(DEV(J) * ODN)
         END DO

************************************************************************
*             compute the pooled correlation matrix
************************************************************************
         DO J2 = 1, PCON                    ! zero
           DO J1 = J2, PCON
             R(J1,J2) = 0.D0
           END DO 
         END DO
         TOT = 0.

         DO L = 1, K
           DO i = 1, N
           
!!!!!!!!!!!!!!!!
                IF (ISNAN(Q(i,L))) STOP 'NaN in Q at pooled correl!'
!!!!!!!!!!!!!!!!!!!!!!!!!!!                

             CALL DTPFC (X(1,i), P, T, CAT, CATTOT, LIMCON, LIMCAT, 
     $                    C(1,L), TAL(1,L), POP(L), SLIT, DS1, DT)
     
*                  standardize 
             DO J = 1, PCON
               IF (DSAFDIV(DS1(J), DEV(J))) DS1(J) = DS1(J) / DEV(J)
               IF (ISNAN(DS1(J))) STOP 'NaN in DS1!'
             END DO
             D = DPROD(Q(i,L), W(i,L)) 

*               accumulate             
             DO J2 = 1, PCON
               DO J1 = J2, PCON
                 R(J1,J2) = R(J1,J2) + D * (DS1(J1) * DS1(J2))                 
               END DO    ! next J1
             END DO    ! next J2
             TOT = TOT + D  ! Tyler & Vardi's formula
           END DO    ! next center
         END DO    ! next point         
         IF (DSAFDIV(1.D0, TOT)) THEN
           D = 1. / TOT
         ELSE
           D = ODN
         END IF

         DO J2 = 1, PCON
           DO J1 = J2, PCON
             R(J1,J2) = R(J1,J2) * D
             R(J2,J1) = R(J1,J2)         ! fill upper triangle
             IF (ISNAN(R(J1,J2))) STOP 'NaN in R!'
           END DO 
         END DO

*             pad the diagonal
        DO J = 1, PCON
          R(J,J) = R(J,J) + DEPS * PCON
        END DO

!!!!!!!!!!!!!!!              dump correl matrix         
!          PRINT *, 'using correlation matrix: '
  22      FORMAT (G14.4, $)
  23      FORMAT ()
!          DO J1 = 1, PCON  
!            DO J2 = 1, PCON
!              PRINT 22, R(J1,J2)
!            END DO
!            PRINT 23
!          END DO
!!!!!!!!!!!!!!!!!!!!!!!!          

*          pseudo-invert 
         CALL DRDI (R, PCON, RANK, LAD, U, S, V, DS1, DS2, IERR)
         IF (RANK < 1) THEN
           IERR = 4
           DEALLOCATE (DEV, DS1, DS2, DT, Q, QOLD, W,
     $             DQ, S, R, U, V, STAT=IALL)
           RETURN
         END IF

         IF (IERR .NE. 0) THEN
           IERR = 5
           DEALLOCATE (DEV, DS1, DS2, DT, Q, QOLD, W,
     $             DQ, S, R, U, V, STAT=IALL)
           RETURN
         END IF
         
  50     DO L = 1, K    ! save old memberships
           DO i = 1, N
             QOLD(i,L) = Q(i,L)
           END DO
         END DO

************************************************************************
*           membership Q each object each cluster
************************************************************************
         DO i = 1, N
           DO L = 1, K
             CALL DTPFC (X(1,i), P, T, CAT, CATTOT, LIMCON, LIMCAT,
     $                   C(1,L), TAL(1,L), POP(L), SLIT, DS1, DT)
              
*              standardize
             DO J = 1, PCON
               IF (DSAFDIV(DS1(J), DEV(J))) DS1(J) = DS1(J) / DEV(J)
             END DO

*               multivariate t log probability (Mahalanobis distance)
             DO J = 1, PCON ; DS2(J) = 0.D0 ; END DO
             DO J2 = 1, PCON
               DO J1 = 1, PCON
                 DS2(J1) = DS2(J1) + R(J1,J2) * DS1(J2)
               END DO
             END DO 
             D = 0.D0
             DO J = 1, PCON
               D = D + DS1(J) * DS2(J)
             END DO
             
             IF (D < 0.D0) THEN
               PRINT *, 'NEG mahalanobis dist! ', D
               D = 0.D0
!!!!!!!!!!!!!!!              dump correl matrix         
         PRINT *, 'spectrum:  ', S
          PRINT *, 'inverse correlation matrix: '
          DO J1 = 1, PCON  
           DO J2 = 1, PCON
              PRINT 22, R(J1,J2)
            END DO
            PRINT 23
          END DO
!!!!!!!!!!!!!!!!!!!!!!!!          
             END IF
             
             W(i,L) = SNGL(NUPP / (NU + D))      ! update weight 
             TOT = NUPP * LOG(1.D0 + D / NU)     ! Student log prob
             TOT = CTERM - 0.5D0 * (LAD + TOT)

*                   add in categorical log probabilities
             DO J = 1, PCAT
               DT(J) = LOG(DT(J))
             END DO
             CALL DADD (DT, PCAT, D)
             TOT = TOT + D
             DQ(L) = TOT                  ! argument to exponent
           END DO  ! next L
           
*             reduce magnitudes of log probabilities for numerical purposes
           D = DQ(1)
           DO L = 1, K
             D = MAX(D, DQ(L))
           END DO
           DO L = 1, K
             DQ(L) = DQ(L) - D
           END DO
             
*                  multivariate probability density
!!           ODK = 1.D0 / DBLE(K)
           DO L = 1, K
             DQ(L) = EXP(DQ(L)) * DBLE(POP(L)) * ODN  ! multiply by prior prob.
!!             DQ(L) = EXP(DQ(L)) * ODK                  ! uniform prior prob.
           END DO     ! next L
           CALL DADD (DQ, K, QT)
           
************************************************************************
*                 make a new cluster?
************************************************************************
           IF (QT .LE. 0.D0) THEN
             PRINT *, 'tclust: zero probability object: ', i
             IERR = 9
             RETURN           
           ELSE IF (QT < DLIT) THEN
             IF (K .GE. KMAX) THEN
               PRINT *, 'tclust:  too many clusters to insert'
               IERR = 8
               RETURN
             END IF
             DO L = 1, K                 ! erase memberships in other clusters
               Q(I,L) = 0.
             END DO
             K = K + 1
             DO H = 1, N
               IF (H .EQ. I) THEN
                 Q(H,K) = 1.
                 QOLD(H,K) = 1.
               ELSE
                 Q(H,K) = 0.
                 QOLD(H,K) = 0.
               END IF
             END DO
             DO H = 1, N
               W(H,K) = 1.
             END DO
             DO J = 1, P
               C(J,K) = X(J,I)
             END DO
             DO H = 1, CATTOT             ! clear tally
               TAL(H,K) = 0.
             END DO
             HCUM = 0
             DO J = 1, P                  ! count the lone object
               IF (CAT(J) > 0) THEN
                 H = HCUM + NINT(X(J,I))
                 TAL(H,K) = 1.
                 HCUM = HCUM + CAT(J)
               END IF
             END DO
             POP(K) = 1.
      
************************************************************************
*            update memberships for this object
************************************************************************
           ELSE IF (DSAFDIV(1.D0, QT)) THEN
             QT = 1.D0 / QT
             DO L = 1, K                  ! normalize probability
               Q(i,L) = SNGL(DQ(L) * QT)
             END DO
           ELSE
             PRINT *, 'tclust: cant divide by qt !'
             DO L = 1, K
               Q(i,L) = 1. / FLOAT(K)
             END DO
           END IF
         END DO         ! next i

************************************************************************
*               check for convergence
************************************************************************
         TOT = 0.D0
         DO L = 1, K
           DO i = 1, N
             TOT = TOT + (Q(i,L) - QOLD(i,L)) **2
           END DO
         END DO         
         IF (TOT < TOL) GO TO 90
         
*                 apply viscosity
         DO L = 1, K
           DO i = 1, N
             Q(i,L) = ETA * QOLD(i,L) + (1. - ETA) * Q(i,L)
           END DO
         END DO

         IF (dump) THEN
           PRINT *, 'tclust: iter ', G, ' clusters: ', K,  
     $      ' rank: ', RANK, ' at convergence test, tot: ', TOT
         END IF
     
         ETA = ETA **VISPOW
       END DO  ! next g

       IERR = 2      ! too many iterations
       DEALLOCATE (DEV, DS1, DS2, DT, Q, QOLD, W,
     $            DQ, S, R, U, V, STAT=IALL)
       RETURN
       
************************************************************************
*             set Z to maximum membership
************************************************************************
  90     DO i = 1, N
           Y = 0.
           DO L = 1, K
             IF (Q(i,L) > Y) THEN
               Y = Q(i,L)
               Z(i) = L
             END IF
           END DO
         END DO
                
*           modes
       DO L = 1, K
         HCUM = 0
         DO J = 1, P
           IF (CAT(J) > 0) THEN
             Y = 0.
             DO H = 1, CAT(J) 
               IF (TAL(HCUM+H,L) > Y) THEN
                 Y = TAL(HCUM+H,L)
                 C(J,L) = FLOAT(H)
               END IF
             END DO
             HCUM = HCUM + CAT(J)
           END IF
         END DO  ! next variable
       END DO  ! next cluster
         
!!!!!!!!!!!!!!!! experiment, show DEV
       IF (dump) THEN
         PRINT *, 'tclust: importance weights are: '
         DO J = 1, PCON
           PRINT 22, 1. / DEV(J)
         END DO 
         PRINT 23
       END IF
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!       
        
       DEALLOCATE (DEV, DS1, DS2, DT, Q, QOLD, W,
     $             DQ, S, R, U, V, STAT=IALL)
       IF (IALL .NE. 0) THEN
         PRINT *, 'tclust: trouble releasing memory!'
         IERR = 6
       END IF
       
       RETURN
      END ! of tclust
