Source code for Module.FN_Computation

# Yuncong Ma, 1/10/2024
# FN Computation module of pNet

#########################################
# Packages
import numpy
import numpy as np
import scipy
import scipy.io as sio
import os
import re
import time

# other functions of pNet
from Module.Data_Input import *


[docs]def mat_corr(X, Y=None, dataPrecision='double'):
    """
    mat_corr(X, Y=None, dataPrecision='double')
    Perform corr as in MATLAB, pair-wise Pearson correlation between columns in X and Y

    :param X: 1D or 2D matrix
    :param Y: 1D or 2D matrix, or None
    :param dataPrecision: 'double' or 'single'
    X and Y have the same number of rows
    :return: Corr

    Note: this method will use memory as it concatenates X and Y along column direction.
    By Yuncong Ma, 9/5/2023
    """

    np_float, np_eps = set_data_precision(dataPrecision)
    if not isinstance(X, np.ndarray):
        X = np.array(X, dtype=np_float)
    else:
        X = X.astype(np_float)
    if Y is not None:
        if not isinstance(Y, np.ndarray):
            Y = np.array(Y, dtype=np_float)
        else:
            Y = Y.astype(np_float)

    # Check size of X and Y
    if len(X.shape) > 2 or (Y is not None and len(Y.shape) > 2):
        raise ValueError("X and Y must be 1D or 2D matrices")
    if Y is not None and X.shape[0] != Y.shape[0]:
        raise ValueError("X and Y must have the same number of columns")

    dim_X = X.shape
    if Y is not None:
        dim_Y = Y.shape
        if len(dim_X) == 2 and len(dim_Y) == 2:
            temp = np.corrcoef(X, Y, rowvar=False)
            Corr = temp[0:dim_X[1], dim_X[1]:dim_X[1]+dim_Y[1]]
        elif len(dim_X) == 1 and len(dim_Y) == 2:
            temp = np.corrcoef(X, Y, rowvar=False)
            Corr = temp[0, 1:1+dim_Y[1]]
        elif len(dim_X) == 2 and len(dim_Y) == 1:
            temp = np.corrcoef(X, Y, rowvar=False)
            Corr = temp[1:1+dim_X[1], 0]
        else:
            temp = np.corrcoef(X, Y, rowvar=False)
            Corr = temp[0, 1]
    else:
        if len(X.shape) != 2:
            raise ValueError("X must be a 2D matrix")
        Corr = np.corrcoef(X, rowvar=False)

    return Corr


[docs]def normalize_data(data, algorithm='vp', normalization='vmax', dataPrecision='double'):
    """
    normalize_data(data, algorithm='vp', normalization='vmax', dataPrecision='double')
    Normalize data by algorithm and normalization settings

    :param data: data in 2D matrix [dim_time, dim_space], recommend to use reference mode to save memory
    :param algorithm: 'z' 'gp' 'vp'
    :param normalization: 'n2' 'n1' 'rn1' 'g' 'vmax'
    :param dataPrecision: 'double' or 'single'
    :return: data

    Consistent to MATLAB function normalize_data(X, algorithm, normalization, dataPrecision)
    By Yuncong Ma, 9/8/2023
    """

    if len(data.shape) != 2:
        raise ValueError("Data must be a 2D matrix")

    np_float, np_eps = set_data_precision(dataPrecision)
    if not isinstance(data, np.ndarray):
        data = np.array(data, dtype=np_float)
    else:
        data = data.astype(np_float)

    if algorithm.lower() == 'z':
        # standard score for each variable
        mVec = np.mean(data, axis=1)
        sVec = np.maximum(np.std(data, axis=1), np_eps)
        data = (data - mVec[:, np.newaxis]) / sVec[:, np.newaxis]
    elif algorithm.lower() == 'gp':
        # remove negative value globally
        minVal = np.min(data)
        shiftVal = np.abs(np.minimum(minVal, 0))
        data = data + shiftVal
    elif algorithm.lower() == 'vp':
        # remove negative value voxel-wisely
        minVal = np.min(data, axis=0, keepdims=True)
        shiftVal = np.abs(np.minimum(minVal, 0))
        data += shiftVal
    else:
        # do nothing
        data = data

    if normalization.lower() == 'n2':
        # l2 normalization for each observation
        l2norm = np.sqrt(np.sum(data ** 2, axis=1)) + np_eps
        data = data / l2norm[:, np.newaxis]
    elif normalization.lower() == 'n1':
        # l1 normalization for each observation
        l1norm = np.sum(data, axis=1) + np_eps
        data = data / l1norm[:, np.newaxis]
    elif normalization.lower() == 'rn1':
        # l1 normalization for each variable
        l1norm = np.sum(data, axis=0) + np_eps
        data = data / l1norm
    elif normalization.lower() == 'g':
        # global scale
        sVal = np.sort(data, axis=None)
        perT = 0.001
        minVal = sVal[int(len(sVal) * perT)]
        maxVal = sVal[int(len(sVal) * (1 - perT))]
        data[data < minVal] = minVal
        data[data > maxVal] = maxVal
        data = (data - minVal) / max((maxVal - minVal), np_eps)
    elif normalization.lower() == 'vmax':
        cmin = np.min(data, axis=0, keepdims=True)
        cmax = np.max(data, axis=0, keepdims=True)
        data = (data - cmin) / np.maximum(cmax - cmin, np_eps)
    else:
        # do nothing
        data = data

    if np.isnan(data).any():
        raise ValueError('  nan exists, check the preprocessed data')

    return data


[docs]def initialize_u(X, U0, V0, error=1e-4, maxIter=1000, minIter=30, meanFitRatio=0.1, initConv=1, dataPrecision='double'):
    """
    initialize_u(X, U0, V0, error=1e-4, maxIter=1000, minIter=30, meanFitRatio=0.1, initConv=1, dataPrecision='double')

    :param X: data, 2D matrix [dim_time, dim_space]
    :param U0: initial temporal component, 2D matrix [dim_time, k]
    :param V0: initial spatial component, 2D matrix [dim_space, k]
    :param error: data fitting error
    :param maxIter: maximum iteration
    :param minIter: minimum iteration
    :param meanFitRatio: a 0-1 scaler, exponential moving average coefficient
    :param initConv: 0 or 1, flag for convergence
    :param dataPrecision: 'double' or 'single'
    :return: U_final: temporal components of FNs, a 2D matrix [dim_time, K]

    Consistent to MATLAB function initialize_u(X, U0, V0, error, maxIter, minIter, meanFitRatio, initConv, dataPrecision)
    By Yuncong Ma, 9/5/2023
    """

    np_float, np_eps = set_data_precision(dataPrecision)
    if not isinstance(X, np.ndarray):
        X = np.array(X, dtype=np_float)
    else:
        X = X.astype(np_float)
    if not isinstance(U0, np.ndarray):
        U0 = np.array(U0, dtype=np_float)
    else:
        U0 = U0.astype(np_float)
    if not isinstance(V0, np.ndarray):
        V0 = np.array(V0, dtype=np_float)
    else:
        V0 = V0.astype(np_float)

    # Check the data size of X, U0 and V0
    if len(X.shape) != 2 or len(U0.shape) != 2 or len(V0.shape) != 2:
        raise ValueError("X, U0 and V0 must be 2D matrices")
    if X.shape[0] != U0.shape[0] or X.shape[1] != V0.shape[0] or U0.shape[1] != V0.shape[1]:
        raise ValueError("X, U0 and V0 need to have appropriate sizes")

    # Duplicate a copy for iterative update of U and V
    U = U0.copy()
    V = V0.copy()

    newFit = data_fitting_error(X, U, V, 0, 1, dataPrecision)
    meanFit = newFit / meanFitRatio

    maxErr = 1
    for i in range(1, maxIter+1):
        # update U with fixed V
        XV = X @ V
        VV = V.T @ V
        UVV = U @ VV

        U = U * (XV / np.maximum(UVV, np_eps))

        if i > minIter:
            if initConv:
                newFit = data_fitting_error(X, U, V, 0, 1, dataPrecision)
                meanFit = meanFitRatio * meanFit + (1 - meanFitRatio) * newFit
                maxErr = (meanFit - newFit) / meanFit

        if maxErr <= error:
            break

    U_final = U
    return U_final


[docs]def data_fitting_error(X, U, V, deltaVU=0, dVordU=1, dataPrecision='double'):
    """
    data_fitting_error(X, U, V, deltaVU, dVordU, dataPrecision='double')
    Calculate the datat fitting of X'=UV' with terms

    :param X: 2D matrix, [Space, Time]
    :param U: 2D matrix, [Time, k]
    :param V: 2D matrix, [Space, k]
    :param deltaVU: 0
    :param dVordU: 1
    :param dataPrecision: 'double' or 'single'
    :return: Fitting_Error

    Consistent to MATLAB function fitting_initialize_u(X, U, V, deltaVU, dVordU, dataPrecision)
    By Yuncong Ma, 9/6/2023
    """

    np_float, np_eps = set_data_precision(dataPrecision)
    if not isinstance(X, np.ndarray):
        X = np.array(X, dtype=np_float)
    else:
        X = X.astype(np_float)
    if not isinstance(U, np.ndarray):
        U = np.array(U, dtype=np_float)
    else:
        U = U.astype(np_float)
    if not isinstance(V, np.ndarray):
        V = np.array(V, dtype=np_float)
    else:
        V = V.astype(np_float)

    # Check data size of X, U and V
    if len(X.shape) != 2 or len(U.shape) != 2 or len(V.shape) != 2:
        raise ValueError("X, U and V must be 2D matrices")
    if X.shape[0] != U.shape[0] or X.shape[1] != V.shape[0] or U.shape[1] != V.shape[1]:
        raise ValueError("X, U and V need to have appropriate sizes")

    dV = []
    maxM = 62500000  # To save memory
    dim_time, dim_space = X.shape
    mn = np.prod(X.shape)
    nBlock = int(np.floor(mn*3/maxM))

    if mn < maxM:
        dX = U @ V.T - X
        obj_NMF = np.sum(np.power(dX, 2))
        if deltaVU:
            if dVordU:
                dV = dX.T * U
            else:
                dV = dX * V
    else:
        obj_NMF = 0
        if deltaVU:
            if dVordU:
                dV = np.zeros_like(V)
            else:
                dV = np.zeros_like(U)
        for i in range(int(np.ceil(dim_space/nBlock))):
            if i == int(np.ceil(dim_space/nBlock)):
                smpIdx = range(i*nBlock, dim_space)
            else:
                smpIdx = range(i*nBlock, np.minimum(dim_space, (i+1)*nBlock))
            dX = (U @ V[smpIdx, :].T) - X[:, smpIdx]
            obj_NMF += np.sum(np.power(dX, 2))
            if deltaVU:
                if dVordU:
                    dV[smpIdx, :] = np.dot(dX.T, U)
                else:
                    dV += np.dot(dX, V[smpIdx, :])
        if deltaVU:
            if dVordU:
                dV = dV

    Fitting_Error = obj_NMF
    return Fitting_Error


[docs]def normalize_u_v(U, V, NormV, Norm, dataPrecision='double'):
    """
    normalize_u_v(U, V, NormV, Norm, dataPrecision='double')
    Normalize U and V with terms

    :param U: 2D matrix, [Time, k]
    :param V: 2D matrix, [Space, k]
    :param NormV: 1 or 0
    :param Norm: 1 or 2
    :param dataPrecision: 'double' or 'single'
    :return: U, V

    Consistent to MATLAB function normalize_u_v(U, V, NormV, Norm, dataPrecision)
    By Yuncong Ma, 9/5/2023
    """

    np_float, np_eps = set_data_precision(dataPrecision)
    if not isinstance(U, np.ndarray):
        U = np.array(U, dype=np_float)
    else:
        U = U.astype(np_float)
    if not isinstance(V, np.ndarray):
        V = np.array(V, dype=np_float)
    else:
        V = V.astype(np_float)

    # Check data size of U and V
    if len(U.shape) != 2 or len(V.shape) != 2:
        raise ValueError("U and V must be 2D matrices")
    if U.shape[1] != V.shape[1]:
        raise ValueError("U and V need to have appropriate sizes")

    dim_space = V.shape[0]
    dim_time = U.shape[0]

    if Norm == 2:
        norms = np.sqrt(np.sum(np.power(V, 2), axis=0))
        norms = np.maximum(norms, np_eps)
    else:
        norms = np.max(V, axis=0)
        norms = np.maximum(norms, np_eps)

    if NormV:
        U = U * np.tile(norms, (dim_time, 1))
        V = V / np.tile(norms, (dim_space, 1))
    else:
        U = U / np.tile(norms, (dim_time, 1))
        V = V * np.tile(norms, (dim_space, 1))

    return U, V


[docs]def construct_Laplacian_gNb(gNb: np.ndarray, dim_space, vxI=0, X=None, alphaL=10, normW=1, dataPrecision='double'):
    """
    construct_Laplacian_gNb(gNb: np.ndarray, dim_space, vxI=0, X=None, alphaL=10, normW=1, dataPrecision='double')
    construct Laplacian matrices for Laplacian spatial regularization term

    :param gNb: graph neighborhood, a 2D matrix [N, 2] storing rows and columns of non-zero elements
    :param dim_space: dimension of space (number of voxels or vertices)
    :param vxI: 0 or 1, flag for using the temporal correlation between nodes (vertex, voxel)
    :param X: fMRI data, a 2D matrix, [dim_time, dim_space]
    :param alphaL: internal hyper parameter for Laplacian regularization term
    :param normW: 1 or 2, normalization method for Laplacian matrix W
    :param dataPrecision: 'double' or 'single'
    :return: L, W, D: sparse 2D matrices [dim_space, dim_space]

    Yuncong Ma, 9/13/2023
    """

    np_float, np_eps = set_data_precision(dataPrecision)

    # Construct the spatial affinity graph
    # gNb uses index starting from 1
    W = scipy.sparse.coo_matrix((np.ones(gNb.shape[0]), (gNb[:, 0] - 1, gNb[:, 1] - 1)), shape=(dim_space, dim_space), dtype=np_float)
    if vxI > 0:
        for i in range(gNb.shape[0]):
            xi = gNb[i, 0] - 1
            yi = gNb[i, 1] - 1
            if xi < yi:
                corrVal = (1.0 + mat_corr(X[:, xi], X[:, yi], dataPrecision)) / 2
                W[xi, yi] = corrVal
                W[yi, xi] = corrVal

    # Defining other matrices
    DCol = np.array(W.sum(axis=1), dtype=np_float).flatten()
    D = scipy.sparse.spdiags(DCol, 0, dim_space, dim_space)
    L = D - W
    if normW > 0:
        D_mhalf = scipy.sparse.spdiags(np.power(DCol, -0.5), 0, dim_space, dim_space)
        L = D_mhalf @ L @ D_mhalf * alphaL
        W = D_mhalf @ W @ D_mhalf * alphaL
        D = D_mhalf @ D @ D_mhalf * alphaL

    return L, W, D

[docs]def robust_normalize_V(V, factor = 0.95, dataPrecision='double'):
    # Setup data precision and eps
    np_float, np_eps = set_data_precision(dataPrecision)
    # assume V is the FNs with its first dim as spatial one
    #robust normalization of V
    sdim, fdim = V.shape
    if sdim < fdim:
        print('\n  the input V has wrong dimension.' + str(sdim) + str(fdim) + '\n')
    vtop = np.percentile(V, 1.0-factor, axis=0, keepdims=True)
    vbottome = np.percentile(V, factor, axis=0, keepdims=True)
    vdiff = np.maximum(vtop-vbottome, np_eps)
    #print(vdiff)
    #print(torch.tile(vdiff, (V.shape[0], 1)).shape)
    V = np.clip( (V - np.tile(vbottome, (V.shape[0],1))) / np.tile(vdiff, (V.shape[0], 1)), 0.0, 1.0)
    return V

[docs]def pFN_NMF(Data, gFN, gNb, maxIter=1000, minIter=30, meanFitRatio=0.1, error=1e-4, normW=1,
            Alpha=2, Beta=30, alphaS=0, alphaL=0, vxI=0, initConv=1, ard=0, eta=0, dataPrecision='double', logFile='Log_pFN_NMF.log'):
    """
    pFN_NMF(Data, gFN, gNb, maxIter=1000, minIter=30,
            meanFitRatio=0.1, error=1e-4, normW=1,
            Alpha=2, Beta=30, alphaS=2, alphaL=10, initConv=1, ard=0, eta=0,
            dataPrecision='double', logFile='Log_pFN_NMF.log')
    Compute personalized FNs by spatially-regularized NMF method with group FNs as initialization

    :param Data: 2D matrix [dim_time, dim_space]. Data will be formatted to Tensor and normalized.
    :param gFN: group level FNs 2D matrix [dim_space, K], K is the number of functional networks. gFN will be cloned
    :param gNb: graph neighborhood, a 2D matrix [N, 2] storing rows and columns of non-zero elements
    :param maxIter: maximum iteration number for multiplicative update
    :param minIter: minimum iteration in case fast convergence
    :param meanFitRatio: a 0-1 scaler, exponential moving average coefficient, used for the initialization of U when using group initialized V
    :param error: difference of cost function for convergence
    :param normW: 1 or 2, normalization method for W used in Laplacian regularization
    :param Alpha: hyper parameter for spatial sparsity
    :param Beta: hyper parameter for Laplacian sparsity
    :param alphaS: internally determined, the coefficient for spatial sparsity based Alpha, data size, K, and gNb
    :param alphaL: internally determined, the coefficient for Laplacian sparsity based Beta, data size, K, and gNb
    :param vxI: 0 or 1, flag for using the temporal correlation between nodes (vertex, voxel)
    :param initConv: flag for convergence of initialization of U
    :param ard: 0 or 1, flat for combining similar clusters
    :param eta: a hyper parameter for the ard regularization term
    :param dataPrecision: 'single' or 'float32', 'double' or 'float64'
    :param logFile: str, directory of a txt log file
    :return: U and V. U is the temporal components of pFNs, a 2D matrix [dim_time, K], and V is the spatial components of pFNs, a 2D matrix [dim_space, K]

    Yuncong Ma, 12/13/2023
    """

    # Setup data precision and eps
    np_float, np_eps = set_data_precision(dataPrecision)
    if not isinstance(gFN, np.ndarray):
        gFN = np.array(gFN, dype=np_float)
    else:
        gFN = gFN.astype(np_float)
    if not isinstance(Data, np.ndarray):
        Data = np.array(Data, dype=np_float)
    else:
        Data = Data.astype(np_float)

    # check dimension of Data and gFN
    if Data.shape[1] != gFN.shape[0]:
        raise ValueError("The second dimension of Data should match the first dimension of gFn, as they are space dimension")

    K = gFN.shape[1]

    # setup log file
    logFile = open(logFile, 'a')
    print(f'\nStart NMF for pFN using NumPy at '+time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))+'\n', file=logFile, flush=True)

    # initialization
    initV = gFN.copy()

    dim_time, dim_space = Data.shape

    # Median number of graph neighbors
    nM = np.median(np.unique(gNb[:, 0], return_counts=True)[1])

    # Use Alpha and Beta to set alphaS and alphaL if they are 0
    if alphaS == 0 and Alpha > 0:
        alphaS = np.round(Alpha * dim_time / K)
    if alphaL == 0 and Beta > 0:
        alphaL = np.round(Beta * dim_time / K / nM)

    # Prepare and normalize scan
    Data = normalize_data(Data, 'vp', 'vmax', dataPrecision=dataPrecision)
    X = Data    # Save memory

    # Construct the spatial affinity graph
    L, W, D = construct_Laplacian_gNb(gNb, dim_space, vxI, X, alphaL, normW, dataPrecision)

    # Initialize V
    V = np.copy(initV)
    miv = np.max(V, axis=0)
    trimInd = V / np.maximum(np.tile(miv, (dim_space, 1)), np_eps) < 5e-2
    V[trimInd] = 0
    
    # robust normalization of V  added on 08/03/2024
    #V = robust_normalize_V(V, factor=1.0 / K)  # try to keep 1/K non-zero values for each component, updated on 08/03/2024

    # Initialize U
    U = X @ V / np.tile(np.sum(V, axis=0), (dim_time, 1))

    U = initialize_u(X, U, V, error=error, maxIter=100, minIter=30, meanFitRatio=meanFitRatio, initConv=initConv, dataPrecision=dataPrecision)

    initV = V.copy()

    # Alternative update of U and V
    # Variables

    if ard > 0:
        lambdas = np.sum(U, axis=0) / dim_time
        hyperLam = eta * np.sum(np.power(X, 2)) / (dim_time * dim_space * 2)
    else:
        lambdas = 0
        hyperLam = 0

    flagQC = 0
    oldLogL = np.inf
    oldU = U.copy()
    oldV = V.copy()
    #  Multiplicative update of U and V
    for i in range(1, 1+maxIter):
        # ===================== update V ========================
        # Eq. 8-11
        XU = X.T @ U
        UU = U.T @ U
        VUU = V @ UU

        tmpl2 = np.power(V, 2)

        if alphaS > 0:
            tmpl21 = np.sqrt(tmpl2)
            tmpl22 = np.tile(np.sqrt(np.sum(tmpl2, axis=0)), (dim_space, 1))
            tmpl21s = np.tile(np.sum(tmpl21, axis=0), (dim_space, 1))
            posTerm = V / np.maximum(tmpl21 * tmpl22, np_eps)
            negTerm = V * tmpl21s / np.maximum(np.power(tmpl22, 3), np_eps)

            VUU = VUU + 0.5 * alphaS * posTerm
            XU = XU + 0.5 * alphaS * negTerm

        if alphaL > 0:
            WV = W @ V.astype(np.float64)
            DV = D @ V.astype(np.float64)

            XU = XU + WV
            VUU = VUU + DV

        V = V * (XU / np.maximum(VUU, np_eps))

        # Prune V if empty components are found in V
        # This is almost impossible to happen without combining FNs
        prunInd = np.sum(V != 0, axis=0) == 1
        if np.any(prunInd):
            V[:, prunInd] = np.zeros((dim_space, np.sum(prunInd)), dype=np_float)
            U[:, prunInd] = np.zeros((dim_time, np.sum(prunInd)), dype=np_float)

        # normalize U and V
        U, V = normalize_u_v(U, V, 1, 1, dataPrecision=dataPrecision)

        # ===================== update U =========================
        XV = X @ V
        VV = V.T @ V
        UVV = U @ VV

        if ard > 0:  # ard term for U
            posTerm = 1 / np.maximum(np.tile(lambdas, (dim_time, 1)), np_eps)

            UVV = UVV + posTerm * hyperLam

        U = U * (XV / np.maximum(UVV, np_eps))

        # Prune U if empty components are found in U
        # This is almost impossible to happen without combining FNs
        prunInd = np.sum(U, axis=0) == 0
        if np.any(prunInd):
            V[:, prunInd] = np.zeros((dim_space, np.sum(prunInd)), dype=np_float)
            U[:, prunInd] = np.zeros((dim_time, np.sum(prunInd)), dype=np_float)

        # update lambda
        if ard > 0:
            lambdas = np.sum(U, axis=0) / dim_time

        # ==== calculate objective function value ====
        ardU = 0
        tmp2 = 0
        tmpl21 = np.power(V, 2)

        if ard > 0:
            su = np.sum(U, axis=0)
            su[su == 0] = 1
            ardU = np.sum(np.log(su)) * dim_time * hyperLam

        if alphaL > 0:
            tmp2 = V.T @ L * V.T

        L21 = alphaS * np.sum(np.sum(np.sqrt(tmpl21), axis=0) / np.maximum(np.sqrt(np.sum(tmpl21, axis=0)), np_eps))
        # LDf = data_fitting_error(X, U, V, 0, 1)
        LDf = np.sum(np.power(X - U @ V.T, 2))
        LSl = np.sum(tmp2)

        # Objective function
        LogL = L21 + ardU + LDf + LSl
        print(f"    Iter = {i}: LogL: {LogL}, dataFit: {LDf}, spaLap: {LSl}, L21: {L21}, ardU: {ardU}", file=logFile)

        # The iteration needs to meet minimum iteration number and small changes of LogL
        if i > minIter and abs(oldLogL - LogL) / np.maximum(oldLogL, np_eps) < error:
            break
        oldLogL = LogL.copy()

        # QC Control
        temp = mat_corr(gFN, V, dataPrecision)
        QC_Spatial_Correspondence = np.copy(np.diag(temp))
        temp -= np.diag(2 * np.ones(K))  # set diagonal values to lower than -1
        QC_Spatial_Correspondence_Control = np.max(temp, axis=1)
        QC_Delta_Sim = np.min(QC_Spatial_Correspondence - QC_Spatial_Correspondence_Control)

        if QC_Delta_Sim <= 0:
            flagQC = 1
            U = oldU.copy()
            V = oldV.copy()
            print(f'\n  QC: Meet QC constraint: Delta sim = {QC_Delta_Sim}', file=logFile, flush=True)
            print(f'    Use results from last iteration', file=logFile, flush=True)
            break
        else:
            oldU = U.copy()
            oldV = V.copy()
            print(f'        QC: Delta sim = {QC_Delta_Sim}', file=logFile, flush=True)

    print(f'\n Finished at '+time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))+'\n', file=logFile, flush=True)

    return U, V


[docs]def gFN_NMF(Data, K, gNb, init='random', maxIter=1000, minIter=200, error=1e-8, normW=1,
            Alpha=2, Beta=30, alphaS=0, alphaL=0, vxI=0, ard=0, eta=0, nRepeat=5, dataPrecision='double', logFile='Log_pFN_NMF.log'):
    """
    gFN_NMF(Data, K, gNb, maxIter=1000, minIter=30, error=1e-8, normW=1,
            Alpha=2, Beta=30, alphaS=0, alphaL=0, vxI=0, ard=0, eta=0, nRepeat=5, dataPrecision='double', logFile='Log_pFN_NMF.log')
    Compute group-level FNs using NMF method

    :param Data: 2D matrix [dim_time, dim_space], recommend to normalize each fMRI scan before concatenate them along the time dimension
    :param K: number of FNs
    :param gNb: graph neighborhood, a 2D matrix [N, 2] storing rows and columns of non-zero elements
    :param init: 'nndsvda': NNDSVD with zeros filled with the average of X (better when sparsity is not desired)  #updated on 08/03/2024
                 'random': non-negative random matrices, scaled with: sqrt(X.mean() / n_components)
                 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) initialization (better for sparseness)
                 'nndsvdar' NNDSVD with zeros filled with small random values (generally faster, less accurate alternative to NNDSVDa for when sparsity is not desired)
    :param maxIter: maximum iteration number for multiplicative update
    :param minIter: minimum iteration in case fast convergence
    :param error: difference of cost function for convergence
    :param normW: 1 or 2, normalization method for W used in Laplacian regularization
    :param Alpha: hyper parameter for spatial sparsity
    :param Beta: hyper parameter for Laplacian sparsity
    :param alphaS: internally determined, the coefficient for spatial sparsity based Alpha, data size, K, and gNb
    :param alphaL: internally determined, the coefficient for Laplacian sparsity based Beta, data size, K, and gNb
    :param vxI: flag for using the temporal correlation between nodes (vertex, voxel)
    :param ard: 0 or 1, flat for combining similar clusters
    :param eta: a hyper parameter for the ard regularization term
    :param nRepeat: Any positive integer, the number of repetition to avoid poor initialization
    :param dataPrecision: 'single' or 'float32', 'double' or 'float64'
    :param logFile: str, directory of a txt log file
    :return: gFN, 2D matrix [dim_space, K]

    Yuncong Ma, 11/27/2023
    """

    # setup log file
    if isinstance(logFile, str):
        logFile = open(logFile, 'a')
    print(f'\nStart NMF for gFN using NumPy at '+time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))+'\n', file=logFile, flush=True)

    # Setup data precision and eps
    np_float, np_eps = set_data_precision(dataPrecision)
    if not isinstance(Data, np.ndarray):
        Data = np.array(Data, dype=np_float)
    else:
        Data = Data.astype(np_float)

    # Input data size
    dim_time, dim_space = Data.shape

    # Median number of graph neighbors
    nM = np.median(np.unique(gNb[:, 0], return_counts=True)[1])

    # Use Alpha and Beta to set alphaS and alphaL if they are 0
    if alphaS == 0 and Alpha > 0:
        alphaS = np.round(Alpha * dim_time / K)
    if alphaL == 0 and Beta > 0:
        alphaL = np.round(Beta * dim_time / K / nM)

    # Prepare and normalize scan
    Data = normalize_data(Data, 'vp', 'vmax', dataPrecision)
    X = Data  # Save memory

    # Construct the spatial affinity graph
    L, W, D = construct_Laplacian_gNb(gNb, dim_space, vxI, X, alphaL, normW, dataPrecision)

    #add initialization parameter on 08/03/2024
    # to use sklearn NMF
    # from sklearn.decomposition import NMF   # for sklearn based NMF initialization
    # model = NMF(n_components=K, init=init, max_iter=1, solver='mu')
    # model._check_params(X)
    # U = None
    # V = None
    # U, V = model._check_w_h(X, U, V,True)
    # return np.transpose(V)

    if init != 'random':
        # to use sklearn NMF  on Aug 07, 2024
        from sklearn.decomposition import NMF
        model = NMF(n_components=K, init=init, max_iter=20000) #, random_state=0)
        W = model.fit_transform(X)
        H = model.components_
        return np.transpose(H)

    flag_Repeat = 0
    for repeat in range(1, 1 + nRepeat):
        flag_Repeat = 0
        print(f'\n Starting {repeat}-th repetition\n', file=logFile, flush=True)

        # Initialize U and V
        mean_X = np.sum(X) / (dim_time*dim_space)
        U = (np.random.rand(dim_time, K) + 1) * (np.sqrt(mean_X/K))
        V = (np.random.rand(dim_space, K) + 1) * (np.sqrt(mean_X/K))

        # Normalize data
        U, V = normalize_u_v(U, V, 1, 1, dataPrecision)

        if ard > 0:
            ard = 1
            eta = 0.1
            lambdas = np.sum(U, axis=0) / dim_time
            hyperLam = eta * np.sum(np.power(X, 2)) / (dim_time * dim_space * 2)
        else:
            lambdas = 0
            hyperLam = 0

        oldLogL = np.inf

        # Multiplicative update of U and V
        for i in range(1, 1+maxIter):
            # ===================== update V ========================
            # Eq. 8-11
            XU = X.T @ U
            UU = U.T @ U
            VUU = V @ UU

            tmpl2 = np.power(V, 2)

            if alphaS > 0:
                tmpl21 = np.sqrt(tmpl2)
                tmpl22 = np.tile(np.sqrt(np.sum(tmpl2, axis=0)), (dim_space, 1))
                tmpl21s = np.tile(np.sum(tmpl21, axis=0), (dim_space, 1))
                posTerm = V / np.maximum(tmpl21 * tmpl22, np_eps)
                negTerm = V * tmpl21s / np.maximum(np.power(tmpl22, 3), np_eps)

                VUU = VUU + 0.5 * alphaS * posTerm
                XU = XU + 0.5 * alphaS * negTerm

            if alphaL > 0:
                WV = W @ V.astype(np.float64)
                DV = D @ V.astype(np.float64)

                XU = XU + WV
                VUU = VUU + DV

            V = V * (XU / np.maximum(VUU, np_eps))

            # Prune V if empty components are found in V
            # This is almost impossible to happen without combining FNs
            prunInd = np.sum(V != 0, axis=0) == 1
            if np.any(prunInd):
                V[:, prunInd] = np.zeros((dim_space, np.sum(prunInd)))
                U[:, prunInd] = np.zeros((dim_time, np.sum(prunInd)))

            # normalize U and V
            U, V = normalize_u_v(U, V, 1, 1)

            # ===================== update U =========================
            XV = X @ V
            VV = V.T @ V
            UVV = U @ VV

            if ard > 0:  # ard term for U
                posTerm = 1 / np.maximum(np.tile(lambdas, (dim_time, 1)), np_eps)

                UVV = UVV + posTerm * hyperLam

            U = U * (XV / np.maximum(UVV, np_eps))

            # Prune U if empty components are found in U
            # This is almost impossible to happen without combining FNs
            prunInd = np.sum(U, axis=0) == 0
            if np.any(prunInd):
                V[:, prunInd] = np.zeros((dim_space, np.sum(prunInd)))
                U[:, prunInd] = np.zeros((dim_time, np.sum(prunInd)))

            # update lambda
            if ard > 0:
                lambdas = np.sum(U) / dim_time

            # ==== calculate objective function value ====
            ardU = 0
            tmp2 = 0
            tmpl21 = np.power(V, 2)

            if ard > 0:
                su = np.sum(U, axis=0)
                su[su == 0] = 1
                ardU = np.sum(np.log(su)) * dim_time * hyperLam

            if alphaL > 0:
                tmp2 = V.T @ L * V.T

            L21 = alphaS * np.sum(np.sum(np.sqrt(tmpl21), axis=0) / np.maximum(np.sqrt(np.sum(tmpl21, axis=0)), np_eps))
            # LDf = data_fitting_error(X, U, V, 0, 1)
            LDf = np.sum(np.power(X - U @ V.T, 2))
            LSl = np.sum(tmp2)

            # Objective function
            LogL = L21 + LDf + LSl + ardU
            print(f"    Iter = {i}: LogL: {LogL}, dataFit: {LDf}, spaLap: {LSl}, L21: {L21}, ardU: {ardU}", file=logFile, flush=True)

            # The iteration needs to meet minimum iteration number and small changes of LogL
            if 1 < i < minIter and abs(oldLogL - LogL) / np.maximum(oldLogL, np_eps) < error:
                flag_Repeat = 1
                print('\n Iteration stopped before the minimum iteration number. The results might be poor.\n', file=logFile, flush=True)
                break
            elif i > minIter and abs(oldLogL - LogL) / np.maximum(oldLogL, np_eps) < error:
                break
            oldLogL = LogL.copy()
        if flag_Repeat == 0:
            break

    if flag_Repeat == 1:
        print('\n All repetition stopped before the minimum iteration number. The final results might be poor\n', file=logFile, flush=True)

    gFN = V
    print(f'\nFinished at '+time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))+'\n', file=logFile, flush=True)

    return gFN


[docs]def gFN_fusion_NCut(gFN_BS, K, NCut_MaxTrial=100, dataPrecision='double', logFile='Log_gFN_fusion_NCut'):
    """
    gFN_fusion_NCut(gFN_BS, K, NCut_MaxTrial=100, dataPrecision='double')
    Fuses FN results to generate representative group-level FNs

    :param gFN_BS: FNs obtained from bootstrapping method, FNs are concatenated along the K dimension
    :param K: Number of FNs, not the total number of FNs obtained from bootstrapping
    :param NCut_MaxTrial: Max number trials for NCut method
    :param dataPrecision: 'double' or 'single'
    :param logFile: str, directory of a txt log file
    :return: gFNs, 2D matrix [dim_space, K]

    Yuncong Ma, 10/2/2023
    """

    # setup log file
    if isinstance(logFile, str):
        logFile = open(logFile, 'a')
    print(f'\nStart NCut for gFN fusion using NumPy at '+time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))+'\n', file=logFile, flush=True)

    # Setup data precision and eps
    np_float, np_eps = set_data_precision(dataPrecision)
    if not isinstance(gFN_BS, np.ndarray):
        gFN_BS = np.array(gFN_BS, dype=np_float)
    else:
        gFN_BS = gFN_BS.astype(np_float)

    # clustering by NCut

    # Get similarity between samples
    corrVal = mat_corr(gFN_BS, dataPrecision=dataPrecision)  # similarity between FNs, [K * n_BS, K * n_BS]
    corrVal[np.isnan(corrVal)] = -1
    nDis = 1 - corrVal  # Transform Pearson correlation to non-negative values similar to distance
    triuInd = np.triu(np.ones(nDis.shape), 1)  # Index of upper triangle
    nDisVec = nDis[triuInd == 1]  # Take the values in upper triangle
    # Make all distance non-negative and normalize their distribution
    nW = np.exp(-np.power(nDis, 2) / (np.power(np.median(nDisVec), 2)))  # Transform distance values using exp(-X/std^2) with std as the median value
    nW[np.isnan(nW)] = 0
    sumW = np.sum(nW, axis=1)  # total distance for each FN
    sumW[sumW == 0] = 1  # In case two FNs are the same
    # Construct Laplacian matrix
    D = np.diag(sumW)
    L = np.sqrt(np.linalg.inv(D)) @ nW @ np.linalg.inv(np.sqrt(D))  # A way to normalize nW based on the total distance of each FN
    L = (L + L.T) / 2  # Ensure L is symmetric. Computation error may result in asymmetry

    eVal, Ev = scipy.sparse.linalg.eigs(L.astype(np.float64), K, which='LR')  # Get first K eigenvectors, sign of vectors may be different to MATLAB results
    Ev = np.real(Ev)
    # Correct the sign of eigenvectors to make them same as derived from MATLAB
    temp = np.sign(np.sum(Ev, axis=0))  # Use the total value of each eigenvector to reset its sign
    temp[temp == 0.0] = 1.0
    Ev = Ev * np.tile(temp, (Ev.shape[0], 1))  # Reset the sign of each eigenvector
    normvect = np.sqrt(np.diag(Ev @ Ev.T))  # Get the norm of each eigenvector
    normvect[normvect == 0.0] = 1  # Incase all 0 eigenvector
    Ev = np.linalg.solve(np.diag(normvect), Ev)  # Use linear solution to normalize Ev satisfying normvect * Ev_new = Ev_old

    # Multiple trials to get reproducible results
    Best_C = []
    Best_NCutValue = np.inf
    for i in range(NCut_MaxTrial):
        print(f'    Iter = ' + str(i), file=logFile)
        EigenVectors = Ev
        n, k = EigenVectors.shape  # n is K * n_BS, k is K

        vm = np.sqrt(np.sum(EigenVectors**2, axis=1, keepdims=True))  # norm of each row
        EigenVectors = EigenVectors / np.tile(vm, (1, k))  # normalize eigenvectors to ensure each FN vector's norm = 1

        R = np.zeros((k, k))
        ps = np.random.randint(0, n, 1)  # Choose a random row in eigenvectors
        R[:, 0] = EigenVectors[ps, :]  # This randomly selected row in eigenvectors is used as an initial center

        c_index = np.zeros(n)
        c = np.zeros(n)  # Total distance to different rows in R [K * n_BS, 1]
        c_index[0] = ps  # Store the index of selected samples
        c[ps] = np.inf

        for j in range(2, k+1):  # Find another K-1 rows in eigenvectors which have the minimum similarity to previous selected rows, similar to initialization in k++
            c += np.abs(EigenVectors @ R[:, j-2])
            ps = np.argmin(c)
            c_index[j-1] = ps
            c[ps] = np.inf
            R[:, j-1] = EigenVectors[ps, :].T

        lastObjectiveValue = 0
        exitLoop = 0
        nbIterationsDiscretisation = 0
        nbIterationsDiscretisationMax = 20

        while exitLoop == 0:
            nbIterationsDiscretisation += 1

            EigenVectorsR = EigenVectors @ R
            n, k = EigenVectorsR.shape
            J = np.argmax(EigenVectorsR, axis=1)  # Assign each sample to K centers of R based on highest similarity
            EigenvectorsDiscrete = scipy.sparse.csr_matrix((np.ones(n), (np.arange(n), J)), shape=(n, k))  # Generate a 0-1 matrix with each row containing only one 1

            U, S, Vh = scipy.linalg.svd(EigenvectorsDiscrete.T @ EigenVectors, full_matrices=False)  # Economy-size decomposition

            S = np.diag(S)  # To match MATLAB svd results
            V = Vh.T  # To match MATLAB svd results
            NcutValue = 2 * (n - np.trace(S))

            # escape the loop when converged or meet max iteration
            if abs(NcutValue - lastObjectiveValue) < np_eps or nbIterationsDiscretisation > nbIterationsDiscretisationMax:
                exitLoop = 1
                print(f'    Reach stop criterion of NCut, NcutValue = '+str(NcutValue)+'\n', file=logFile, flush=True)
            else:
                print(f'    NcutValue = '+str(NcutValue), file=logFile)
                lastObjectiveValue = NcutValue
                R = V @ U.T  # Update R which stores the new centers

        C = np.argmax(EigenvectorsDiscrete.toarray(), axis=1)  # Assign each sample to K centers in R

        if len(np.unique(C)) < K:  # Check whether there are empty results
            print(f'    Found empty results in iteration '+str(i+1)+'\n', file=logFile, flush=True)
        else:  # Update the best result
            if NcutValue < Best_NCutValue:
                Best_NCutValue = NcutValue
                Best_C = C

    if len(set(Best_C)) < K:  # In case even the last trial has empty results
        raise ValueError('  Cannot generate non-empty gFNs\n')
        Flag = 1
        Message = "Cannot generate non-empty FN"

    print(f'Best NCut value = '+str(Best_NCutValue)+'\n', file=logFile, flush=True)

    # Get centroid
    C = Best_C
    gFN = np.zeros((gFN_BS.shape[0], K))
    for ki in range(K):
        if np.sum(C == ki) > 1:
            candSet = gFN_BS[:, C == ki]  # Get the candidate set of FNs assigned to cluster ki
            corrW = np.abs(mat_corr(candSet, dataPrecision=dataPrecision))  # Get the similarity between candidate FNs
            corrW[np.isnan(corrW)] = 0
            mInd = np.argmax(np.sum(corrW, axis=0), axis=0)  # Find the FN with the highest total similarity to all other FNs
            gFN[:, ki] = candSet[:, mInd]
        elif np.sum(C == ki) == 1:
            mInd = C.tolist().index(ki)
            gFN[:, ki] = gFN_BS[:, mInd]

    gFN = gFN / np.maximum(np.tile(np.max(gFN, axis=0), (gFN.shape[0], 1)), np_eps)  # Normalize each FN by its max value
    print(f'\nFinished at '+time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))+'\n', file=logFile, flush=True)

    return gFN


[docs]def compute_gNb(Brain_Template, logFile=None):
    """
    compute_gNb(Brain_Template, logFile=None)
    Prepare a graph neighborhood variable, using indices as its sparse representation

    :param Brain_Template: a structure variable with keys 'Data_Type', 'Template_Format', 'Shape', 'Brain_Mask'.
        If Brain_Template.Data_Type is 'Surface', Shape contains L and R, with vertices and faces as sub keys. Brain_Mask contains L and R.
        If Brain_Template.Data_Type is 'Volume', Shape is None, Brain_Mask is a 3D 0-1 matrix, Overlay_Image is a 3D matrix
        If Brain_Template.Data_Type is 'Surface-Volume', It includes fields from both 'Surface' and 'Volume', 'Brain_Mask' is renamed to be 'Surface_Mask' and 'Volume_Mask'
    :param logFile:
    :return: gNb: a 2D matrix [N, 2], which labels the non-zero elements in a graph. Index starts from 1

    Yuncong Ma, 11/13/2023
    """

    # Check Brain_Template
    if 'Data_Type' not in Brain_Template.keys():
        raise ValueError('Cannot find Data_Type in the Brain_Template')
    if 'Template_Format' not in Brain_Template.keys():
        raise ValueError('Cannot find Data_Format in the Brain_Template')

    # Construct gNb
    if Brain_Template['Data_Type'] == 'Surface':
        Brain_Surface = Brain_Template
        # Number of vertices
        Nv_L = Brain_Surface['Shape']['L']['vertices'].shape[0]  # left hemisphere
        Nv_R = Brain_Surface['Shape']['R']['vertices'].shape[0]
        # Number of faces
        Nf_L = Brain_Surface['Shape']['L']['faces'].shape[0]  # left hemisphere
        Nf_R = Brain_Surface['Shape']['R']['faces'].shape[0]
        # Index of useful vertices, starting from 1
        vL = np.sort(np.where(Brain_Surface['Brain_Mask']['L'] == 1)[0]) + int(1)  # left hemisphere
        vR = np.sort(np.where(Brain_Surface['Brain_Mask']['R'] == 1)[0]) + int(1)
        # Create gNb using matrix format
        # Exclude the medial wall or other vertices outside the mask
        # Set the maximum size to avoid unnecessary memory allocation
        gNb_L = np.zeros((3 * Nf_L, 2), dtype=np.int64)
        gNb_R = np.zeros((3 * Nf_R, 2), dtype=np.int64)
        Count_L = 0
        Count_R = 0

        # Get gNb of all useful vertices in the left hemisphere
        for i in range(0, Nf_L):
            temp = Brain_Surface['Shape']['L']['faces'][i]
            temp = np.intersect1d(temp, vL)

            if len(temp) == 2:  # only one line
                gNb_L[Count_L, :] = temp
                Count_L += 1
            elif len(temp) == 3:  # three lines
                temp = np.tile(temp, (2, 1)).T
                temp[:, 1] = temp[(1, 2, 0), 1]
                gNb_L[Count_L:Count_L + 3, :] = temp
                Count_L += 3
            else:
                continue
        gNb_L = gNb_L[0:Count_L, :]  # Remove unused part
        # Right hemisphere
        for i in range(0, Nf_R):
            temp = Brain_Surface['Shape']['R']['faces'][i]
            temp = np.intersect1d(temp, vR)

            if len(temp) == 2:  # only one line
                gNb_R[Count_R, :] = temp
                Count_R += 1
            elif len(temp) == 3:  # three lines
                temp = np.tile(temp, (2, 1)).T
                temp[:, 1] = temp[(1, 2, 0), 1]
                gNb_R[Count_R:Count_R + 3, :] = temp
                Count_R += 3
            else:
                continue
        gNb_R = gNb_R[0:Count_R, :]  # Remove unused part

        # Map the index from all vertices to useful ones
        mapL = np.zeros(Nv_L, dtype=np.int64)
        mapL[vL - 1] = range(1, 1+len(vL))  # Python index starts from 0, gNb index starts from 1
        gNb_L = mapL[(gNb_L.flatten() - 1).astype(int)]
        gNb_L = np.reshape(gNb_L, (int(np.round(len(gNb_L)/2)), 2))
        gNb_L = np.append(gNb_L, gNb_L[:, (-1, 0)], axis=0)
        # right hemisphere
        mapR = np.zeros(Nv_R, dtype=np.int64)
        mapR[vR - 1] = range(1, 1+len(vR))
        gNb_R = mapR[(gNb_R.flatten() - 1).astype(int)]
        gNb_R = np.reshape(gNb_R, (int(np.round(len(gNb_R)/2)), 2))
        gNb_R = np.append(gNb_R, gNb_R[:, (-1, 0)], axis=0)

        # Combine two hemispheres
        gNb = np.concatenate((gNb_L, gNb_R + len(vL)), axis=0)  # Shift index in right hemisphere by the number of useful vertices in left hemisphere
        gNb = np.unique(gNb, axis=0)  # Remove duplicated

    elif Brain_Template['Data_Type'] == 'Volume':
        Brain_Mask = Brain_Template['Brain_Mask'] > 0
        if not (len(Brain_Mask.shape) == 3 or (len(Brain_Mask.shape) == 4 and Brain_Mask.shape[3] == 1)):
            raise ValueError('Mask in Brain_Template needs to be a 3D matrix when the data type is volume')
        if len(Brain_Mask.shape) == 4:
            Brain_Mask = np.squeeze(Brain_Mask, axis=3)

        # Index starts from 1
        sx = Brain_Mask.shape[0]
        sy = Brain_Mask.shape[1]
        sz = Brain_Mask.shape[2]
        # Label non-zero elements in Brain_Mask
        Nm = np.sum(Brain_Mask > 0)
        maskLabel = Brain_Mask.flatten('F')
        maskLabel = maskLabel.astype(np.int64)  # Brain_Mask might be a 0-1 logic matrix
        if 'Volume_Order' in Brain_Template.keys():  # customized index order
            maskLabel[maskLabel > 0] = Brain_Template['Volume_Order']
        else:  # default index order
            maskLabel[maskLabel > 0] = range(1, 1 + Nm)
        maskLabel = np.reshape(maskLabel, Brain_Mask.shape, order='F')  # consistent to MATLAB matrix index order
        # Enumerate each voxel in Brain_Mask
        # The following code is optimized for NumPy array
        # Set the maximum size to avoid unnecessary memory allocation
        gNb = np.zeros((Nm * 26, 2), dtype=np.int64)
        Count = 0
        for xi in range(sx):
            for yi in range(sy):
                for zi in range(sz):
                    if Brain_Mask[xi, yi, zi] > 0:
                        Brain_Mask[xi, yi, zi] = 0  # Exclude the self point
                        #  Create a 3x3x3 box, +2 is for Python range
                        patchBox = (np.maximum((xi - 1, yi - 1, zi - 1), (0, 0, 0)), np.minimum((xi + 2, yi + 2, zi + 2), (sx, sy, sz)))
                        for xni in range(patchBox[0][0], patchBox[1][0]):
                            for yni in range(patchBox[0][1], patchBox[1][1]):
                                for zni in range(patchBox[0][2], patchBox[1][2]):
                                    if Brain_Mask[xni, yni, zni] > 0:
                                        gNb[Count, :] = (maskLabel[xi, yi, zi], maskLabel[xni, yni, zni])
                                        Count += 1
                        Brain_Mask[xi, yi, zi] = 1  # reset the self value

        gNb = gNb[0:Count, :]  # Remove unused space
        gNb = np.unique(gNb, axis=0)  # Remove duplicated, and sort the results

    elif Brain_Template['Data_Type'] == 'Surface-Volume':
        Brain_Template_surf = Brain_Template.copy()
        Brain_Template_surf['Data_Type'] = 'Surface'
        Brain_Template_surf['Brain_Mask'] = Brain_Template_surf['Surface_Mask']
        gNb_surf = compute_gNb(Brain_Template_surf, logFile=logFile)
        Brain_Template_vol = Brain_Template.copy()
        Brain_Template_vol['Data_Type'] = 'Volume'
        Brain_Template_vol['Brain_Mask'] = Brain_Template_surf['Volume_Mask']
        gNb_vol = compute_gNb(Brain_Template_vol, logFile=logFile)
        # Concatenate the two gNbs with index adjustment
        gNb = np.concatenate((gNb_surf, gNb_vol + np.max(gNb_surf)), axis=0)

    else:
        raise ValueError('Unknown combination of Data_Type and Data_Surface: ' + Brain_Template['Data_Type'] + ' : ' + Brain_Template['Data_Format'])

    if len(np.unique(gNb[:, 0])) != max(np.unique(gNb[:, 0])):
        if logFile is not None:
            if isinstance(logFile, str):
                logFile = open(logFile, 'a')
            print('\ngNb contains isolated voxel or vertex which will affect the subsequent analysis', file=logFile, flush=True)

    if logFile is not None:
        print('\ngNb is generated successfully', file=logFile, flush=True)

    return gNb


[docs]def bootstrap_scan(dir_output: str, file_scan: str, file_subject_ID: str, file_subject_folder: str, file_group_ID=None, combineScan=0,
                   samplingMethod='Subject', sampleSize=10, nBS=50, logFile=None):
    """
    bootstrap_scan(dir_output: str, file_scan: str, file_subject_ID: str, file_subject_folder: str, file_group=None, combineScan=0, samplingMethod='Subject', sampleSize=10, nBS=50, logFile=None)
    prepare bootstrapped scan file lists

    :param dir_output: directory of a folder to store bootstrapped files
    :param file_scan: a txt file that stores directories of all fMRI scans
    :param file_subject_ID: a txt file that store subject ID information corresponding to fMRI scan in file_scan
    :param file_subject_folder: a txt file that store subject folder names corresponding to fMRI scans in file_scan
    :param file_group_ID: a txt file that store group information corresponding to fMRI scan in file_scan
    :param combineScan: 0 or 1, whether to combine multiple fMRI scans for each subject
    :param samplingMethod: 'Subject' or 'Group_Subject'. Uniform sampling based subject ID, or group and then subject ID
    :param sampleSize: number of subjects selected for each bootstrapping run
    :param nBS: number of runs for bootstrap
    :param logFile: directory of a txt file
    :return: None

    Yuncong Ma, 10/2/2023
    
    Scan based sampling added on Dec 13, 20204 
    """

    if logFile is not None:
        if isinstance(logFile, str):
            logFile = open(logFile, 'a')
        print(f'\nStart preparing bootstrapped scan list files '+time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))+'\n', file=logFile, flush=True)

    # Lists for input
    list_scan = np.array([line.replace('\n', '') for line in open(file_scan, 'r')])
    scan_ID_unique = np.unique(list_scan)
    N_Scan = scan_ID_unique.shape[0]
        
    list_subject_ID = np.array([line.replace('\n', '') for line in open(file_subject_ID, 'r')])
    subject_ID_unique = np.unique(list_subject_ID)
    N_Subject = subject_ID_unique.shape[0]
    list_subject_folder = np.array([line.replace('\n', '') for line in open(file_subject_folder, 'r')])
    if file_group_ID is not None:
        list_group_D = ''
    else:
        list_group_D = None
    if list_group_D is not None:
        group_unique = np.unique(list_subject_ID)

    # check parameter
    if samplingMethod == 'Group_Subject' and (list_group_D is None or len(list_group_D) == 0):
        raise ValueError('Group information is absent')
    if samplingMethod != 'Subject' and samplingMethod != 'Group_Subject' and samplingMethod != 'Scan':
        raise ValueError('Unknown sampling method for bootstrapping: ' + samplingMethod)
    for i in range(1, nBS+1):
        if not os.path.exists(os.path.join(dir_output, str(i))):
            os.mkdir(os.path.join(dir_output, str(i)))
        List_BS = np.empty(sampleSize, dtype=list)

        # Randomly select subjects
        if samplingMethod == 'Subject':
            if sampleSize > N_Subject: # changed on 08/01/2024
                sampleSize = N_Subject
                #raise ValueError('The number of randomly selected subjects should be no more than the total number of subjects')
            ps = np.sort(np.random.choice(N_Subject, sampleSize, replace=False))
            for j in range(sampleSize):
                if combineScan == 1:
                    # Get all scans from the selected subject
                    temp = list_scan[np.where(np.compare_chararrays(list_subject_ID, subject_ID_unique[ps[j]], '==', False))[0]]
                    List_BS[j] = str.join('\n', temp)
                else:
                    # Choose one scan from the selected subject
                    temp = list_scan[np.where(np.compare_chararrays(list_subject_ID, subject_ID_unique[ps[j]], '==', False))[0]]
                    ps2 = np.random.choice(temp.shape[0], 1)  # list
                    List_BS[j] = temp[ps2[0]]  # transform to string list

        if samplingMethod == 'Group_Subject':
            break
        
        if samplingMethod == 'Scan':
            ps = np.sort(np.random.choice(N_Scan, sampleSize, replace=False))
            for j in range(sampleSize):
                # Choose one scan from the selected subject
                List_BS[j] = list_scan[ps[j]]  # transform to string list           

        # Write the Scan_List.txt file
        if logFile is not None:
            print('\nWrite bootstrapped scan list in file ' + os.path.join(dir_output, str(i), 'Scan_List.txt'), file=logFile)
        FID = open(os.path.join(dir_output, str(i), 'Scan_List.txt'), 'w')
        for j in range(sampleSize):
            print(List_BS[j], file=FID)
        FID.close()


[docs]def setup_pFN_folder(dir_pnet_result: str):
    """
    setup_pFN_folder(dir_pnet_result: str)
    Setup sub-folders in Personalized_FN to

    :param dir_pnet_result: directory of the pNet result folder
    :return: list_subject_folder_unique: unique subject folder array for getting sub-folders in Personalized_FN

    Yuncong Ma, 9/25/2023
    """

    # get directories of sub-folders
    dir_pnet_dataInput, dir_pnet_FNC, _, dir_pnet_pFN, _, _ = setup_result_folder(dir_pnet_result)

    # load settings for data input and FN computation
    if not os.path.isfile(os.path.join(dir_pnet_FNC, 'Setting.json')):
        raise ValueError('Cannot find the setting json file in folder FN_Computation')
    setting = load_json_setting(os.path.join(dir_pnet_FNC, 'Setting.json'))

    combineScan = setting['Combine_Scan']

    file_scan = os.path.join(dir_pnet_dataInput, 'Scan_List.txt')
    file_subject_folder = os.path.join(dir_pnet_dataInput, 'Subject_Folder.txt')

    list_scan = [line.replace('\n', '') for line in open(file_scan, 'r')]
    list_subject_folder = [line.replace('\n', '') for line in open(file_subject_folder, 'r')]
    list_subject_folder = np.array(list_subject_folder)
    list_subject_folder_unique = np.unique(list_subject_folder)

    # Check consistency of setting and files
    if len(list_subject_folder) != len(list_scan):
        raise ValueError('The length of contents in Scan_List.txt and Subject_Folder.txt does NOT match')
    if combineScan and len(list_subject_folder_unique) == len(list_subject_folder):
        raise ValueError('When combineScan is enabled, the txt file Subject_Folder.txt is supposed to show repeated sub-folder names')

    N_Scan = list_subject_folder_unique.shape[0]
    for i in range(N_Scan):
        template = list_subject_folder_unique[i]
        # find scan indexes that match to the subject folder
        scan_index = [i for i, x in enumerate(list_subject_folder) if x == template]
        dir_pnet_pFN_indv = os.path.join(dir_pnet_pFN, template)
        if not os.path.exists(dir_pnet_pFN_indv):
            os.makedirs(dir_pnet_pFN_indv)
        file_scan_ind = os.path.join(dir_pnet_pFN_indv, 'Scan_List.txt')
        file_scan_ind = open(file_scan_ind, 'w')
        for j in range(len(scan_index)):
            print(list_scan[scan_index[j]], file=file_scan_ind) #edited by hm, before edit: print(list_scan[scan_index[j]] + '\n', file=file_scan_ind)
        file_scan_ind.close()

    return list_subject_folder_unique


[docs]def run_FN_Computation(dir_pnet_result: str):
    """
    run_FN_Computation(dir_pnet_result: str)
    run the FN Computation module with settings ready in Data_Input and FN_Computation

    :param dir_pnet_result: directory of pNet result folder

    Yuncong Ma, 1/11/2024
    """

    # get directories of sub-folders
    dir_pnet_dataInput, dir_pnet_FNC, dir_pnet_gFN, dir_pnet_pFN, _, _ = setup_result_folder(dir_pnet_result)

    # log file
    logFile_FNC = os.path.join(dir_pnet_FNC, 'log.log')
    logFile_FNC = open(logFile_FNC, 'w')
    print('\nStart FN computation using Numpy at ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + '\n',
          file=logFile_FNC, flush=True)

    # load settings for data input and FN computation
    if not os.path.isfile(os.path.join(dir_pnet_dataInput, 'Setting.json')):
        raise ValueError('Cannot find the setting json file in folder Data_Input')
    if not os.path.isfile(os.path.join(dir_pnet_FNC, 'Setting.json')):
        raise ValueError('Cannot find the setting json file in folder FN_Computation')
    settingDataInput = load_json_setting(os.path.join(dir_pnet_dataInput, 'Setting.json'))
    settingFNC = load_json_setting(os.path.join(dir_pnet_FNC, 'Setting.json'))
    setting = {'Data_Input': settingDataInput, 'FN_Computation': settingFNC}
    print('Settings are loaded from folder Data_Input and FN_Computation', file=logFile_FNC, flush=True)

    # load basic settings
    dataType = setting['Data_Input']['Data_Type']
    dataFormat = setting['Data_Input']['Data_Format']

    # load Brain Template
    Brain_Template = load_brain_template(os.path.join(dir_pnet_dataInput, 'Brain_Template.json.zip'))

    if dataType == 'Volume':
        Brain_Mask = Brain_Template['Brain_Mask']
    else:
        Brain_Mask = None
    print('Brain template is loaded from folder Data_Input', file=logFile_FNC, flush=True)

    # ============== gFN Computation ============== #
    # Start computation using SP-NMF
    if setting['FN_Computation']['Method'] == 'SR-NMF':
        print('FN computation uses sparsity-regularized non-negative matrix factorization method', file=logFile_FNC, flush=True)

        # Generate additional parameters
        gNb = compute_gNb(Brain_Template)
        scipy.io.savemat(os.path.join(dir_pnet_FNC, 'gNb.mat'), {'gNb': gNb}, do_compression=True)

        if setting['FN_Computation']['Group_FN']['file_gFN'] is None:
            # 2 steps
            # step 1 ============== bootstrap
            # sub-folder in FNC for storing bootstrapped results
            print('Start to prepare bootstrap files', file=logFile_FNC, flush=True)
            dir_pnet_BS = os.path.join(dir_pnet_FNC, 'BootStrapping')
            if not os.path.exists(dir_pnet_BS):
                os.makedirs(dir_pnet_BS)
            # Log
            logFile = os.path.join(dir_pnet_BS, 'Log.log')

            # Input files
            file_scan = os.path.join(dir_pnet_dataInput, 'Scan_List.txt')
            file_subject_ID = os.path.join(dir_pnet_dataInput, 'Subject_ID.txt')
            file_subject_folder = os.path.join(dir_pnet_dataInput, 'Subject_Folder.txt')
            file_group_ID = os.path.join(dir_pnet_dataInput, 'Group_ID.txt')
            if not os.path.exists(file_group_ID):
                file_group = None
            # Parameters
            combineScan = setting['FN_Computation']['Combine_Scan']
            init = setting['FN_Computation']['Group_FN']['BootStrap']['init']  # added on 08/03/2024
            samplingMethod = setting['FN_Computation']['Group_FN']['BootStrap']['samplingMethod']
            sampleSize = setting['FN_Computation']['Group_FN']['BootStrap']['sampleSize']
            nBS = setting['FN_Computation']['Group_FN']['BootStrap']['nBS']
            nTPoints = setting['FN_Computation']['Group_FN']['BootStrap']['nTPoints']  #added on 08/01/2024

            # create scan lists for bootstrap
            bootstrap_scan(dir_pnet_BS, file_scan, file_subject_ID, file_subject_folder,
                           file_group_ID=file_group_ID, combineScan=combineScan,
                           samplingMethod=samplingMethod, sampleSize=sampleSize, nBS=nBS, logFile=logFile)

            # Parameters
            K = setting['FN_Computation']['K']
            maxIter = setting['FN_Computation']['Group_FN']['maxIter']
            minIter = setting['FN_Computation']['Group_FN']['minIter']
            error = setting['FN_Computation']['Group_FN']['error']
            normW = setting['FN_Computation']['Group_FN']['normW']
            Alpha = setting['FN_Computation']['Group_FN']['Alpha']
            Beta = setting['FN_Computation']['Group_FN']['Beta']
            alphaS = setting['FN_Computation']['Group_FN']['alphaS']
            alphaL = setting['FN_Computation']['Group_FN']['alphaL']
            vxI = setting['FN_Computation']['Group_FN']['vxI']
            ard = setting['FN_Computation']['Group_FN']['ard']
            eta = setting['FN_Computation']['Group_FN']['eta']
            nRepeat = setting['FN_Computation']['Group_FN']['nRepeat']
            dataPrecision = setting['FN_Computation']['Computation']['dataPrecision']

            # separate maxIter and minIter for gFN and pFN
            if isinstance(maxIter, int) or (isinstance(maxIter, numpy.ndarray) and maxIter.shape == 1):
                maxIter_gFN = maxIter
                maxIter_pFN = maxIter
            else:
                maxIter_gFN = maxIter[0]
                maxIter_pFN = maxIter[1]
            if isinstance(minIter, int) or (isinstance(minIter, numpy.ndarray) and minIter.shape == 1):
                minIter_gFN = minIter
                minIter_pFN = minIter
            else:
                minIter_gFN = minIter[0]
                minIter_pFN = minIter[1]

            # NMF on bootstrapped subsets
            print('Start to NMF for each bootstrap at ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), file=logFile_FNC, flush=True)
            for rep in range(1, 1+nBS):
                # log file
                logFile = os.path.join(dir_pnet_BS, str(rep), 'Log.log')
                # load data
                file_scan_list = os.path.join(dir_pnet_BS, str(rep), 'Scan_List.txt')
                Data,CHeader,NHeader = load_fmri_scan(file_scan_list, dataType=dataType, dataFormat=dataFormat, nTPoints=nTPoints, Reshape=True, Brain_Mask=Brain_Mask,
                                      Normalization='vp-vmax', logFile=logFile)
                # perform NMF
                FN_BS = gFN_NMF(Data, K, gNb, init=init, maxIter=maxIter_gFN, minIter=minIter_gFN, error=error, normW=normW,
                                Alpha=Alpha, Beta=Beta, alphaS=alphaS, alphaL=alphaL, vxI=vxI, ard=ard, eta=eta,
                                nRepeat=nRepeat, dataPrecision=dataPrecision, logFile=logFile)
                # save results
                FN_BS = reshape_FN(FN_BS, dataType=dataType, Brain_Mask=Brain_Mask)
                sio.savemat(os.path.join(dir_pnet_BS, str(rep), 'FN.mat'), {"FN": FN_BS}, do_compression=True)
                # save FNs in nii.gz and TC as txt file  FY 07/26/2024
                output_FN(FN=FN_BS, 
                          file_output=os.path.join(dir_pnet_BS, str(rep), 'FN.mat'), 
                          file_brain_template = Brain_Template, 
                          dataFormat=dataFormat,
                          Cheader = CHeader,
                          Nheader = NHeader)

            # step 2 ============== fuse results
            # Generate gFNs
            print('Start to fuse bootstrapped results using NCut at ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), file=logFile_FNC, flush=True)
            FN_BS = np.empty(nBS, dtype=np.ndarray)
            # load bootstrapped results
            for rep in range(1, nBS+1):
                FN_BS[rep-1] = np.array(reshape_FN(load_matlab_single_array(os.path.join(dir_pnet_BS, str(rep), 'FN.mat')), dataType=dataType, Brain_Mask=Brain_Mask))
            gFN_BS = np.concatenate(FN_BS, axis=1)
            # log
            logFile = os.path.join(dir_pnet_gFN, 'Log.log')
            # Fuse bootstrapped results
            gFN = gFN_fusion_NCut(gFN_BS, K, logFile=logFile)
            # output
            gFN = reshape_FN(gFN, dataType=dataType, Brain_Mask=Brain_Mask)
            sio.savemat(os.path.join(dir_pnet_gFN, 'FN.mat'), {"FN": gFN}, do_compression=True)
            # save FNs in nii.gz and TC as txt file  FY 07/26/2024
            output_FN(FN=gFN,
                      file_output=os.path.join(dir_pnet_gFN, 'FN.mat'),
                      file_brain_template = Brain_Template,
                      dataFormat=dataFormat,
                      Cheader = CHeader,
                      Nheader = NHeader)

        else:  # use precomputed gFNs
            file_gFN = setting['FN_Computation']['Group_FN']['file_gFN']
            gFN = load_matlab_single_array(file_gFN)
            if dataType == 'Volume':
                Brain_Mask = load_brain_template(os.path.join(dir_pnet_dataInput, 'Brain_Template.json.zip'))['Brain_Mask']
                gFN = reshape_FN(gFN, dataType=dataType, Brain_Mask=Brain_Mask)
            check_gFN(gFN, method=setting['FN_Computation']['Method'])
            if dataType == 'Volume':
                gFN = reshape_FN(gFN, dataType=dataType, Brain_Mask=Brain_Mask)
            sio.savemat(os.path.join(dir_pnet_gFN, 'FN.mat'), {"FN": gFN}, do_compression=True)
            # save FNs in nii.gz and TC as txt file  FY 07/26/2024
            #output_FN(FN=gFN,
            #          file_output=os.path.join(dir_pnet_gFN, 'FN.mat'),
            #          file_brain_template = Brain_Template,
            #          dataFormat=dataFormat, Cheader = CHeader, Nheader = NHeader)

            print('load precomputed gFNs', file=logFile_FNC, flush=True)
        # ============================================= #

        # ============== pFN Computation ============== #
        print('Start to compute pFNs at ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), file=logFile_FNC, flush=True)
        # load precomputed gFNs
        gFN = load_matlab_single_array(os.path.join(dir_pnet_gFN, 'FN.mat'))
        # additional parameter
        gNb = load_matlab_single_array(os.path.join(dir_pnet_FNC, 'gNb.mat'))
        # reshape to 2D if required
        gFN = reshape_FN(gFN, dataType=dataType, Brain_Mask=Brain_Mask)
        # setup folders in Personalized_FN
        list_subject_folder = setup_pFN_folder(dir_pnet_result)
        N_Scan = len(list_subject_folder)
        for i in range(1, N_Scan+1):
            print(f'Start to compute pFNs for {i}-th folder: {list_subject_folder[i-1]}', file=logFile_FNC, flush=True)
            dir_pnet_pFN_indv = os.path.join(dir_pnet_pFN, list_subject_folder[i-1])
            # parameter
            maxIter = setting['FN_Computation']['Personalized_FN']['maxIter']
            minIter = setting['FN_Computation']['Personalized_FN']['minIter']
            meanFitRatio = setting['FN_Computation']['Personalized_FN']['meanFitRatio']
            error = setting['FN_Computation']['Personalized_FN']['error']
            normW = setting['FN_Computation']['Personalized_FN']['normW']
            Alpha = setting['FN_Computation']['Personalized_FN']['Alpha']
            Beta = setting['FN_Computation']['Personalized_FN']['Beta']
            alphaS = setting['FN_Computation']['Personalized_FN']['alphaS']
            alphaL = setting['FN_Computation']['Personalized_FN']['alphaL']
            vxI = setting['FN_Computation']['Personalized_FN']['vxI']
            ard = setting['FN_Computation']['Personalized_FN']['ard']
            eta = setting['FN_Computation']['Personalized_FN']['eta']
            dataPrecision = setting['FN_Computation']['Computation']['dataPrecision']

            # separate maxIter and minIter for gFN and pFN
            if isinstance(maxIter, int) or (isinstance(maxIter, numpy.ndarray) and maxIter.shape == 1):
                maxIter_gFN = maxIter
                maxIter_pFN = maxIter
            else:
                maxIter_gFN = maxIter[0]
                maxIter_pFN = maxIter[1]
            if isinstance(minIter, int) or (isinstance(minIter, numpy.ndarray) and minIter.shape == 1):
                minIter_gFN = minIter
                minIter_pFN = minIter
            else:
                minIter_gFN = minIter[0]
                minIter_pFN = minIter[1]

            # log file
            logFile = os.path.join(dir_pnet_pFN_indv, 'Log.log')
            # load data
            Data, CHeader, NHeader = load_fmri_scan(os.path.join(dir_pnet_pFN_indv, 'Scan_List.txt'),
                                  dataType=dataType, dataFormat=dataFormat,
                                  Reshape=True, Brain_Mask=Brain_Mask, logFile=logFile)
            # perform NMF
            TC, pFN = pFN_NMF(Data, gFN, gNb, maxIter=maxIter_pFN, minIter=minIter_pFN, meanFitRatio=meanFitRatio, error=error, normW=normW,
                              Alpha=Alpha, Beta=Beta, alphaS=alphaS, alphaL=alphaL, vxI=vxI, ard=ard, eta=eta,
                              dataPrecision=dataPrecision, logFile=logFile)
            # output
            # pFN = reshape_FN(pFN.numpy(), dataType=dataType, Brain_Mask=Brain_Mask)  updated on 08/02/2024 removed .numpy()
            pFN = reshape_FN(pFN, dataType=dataType, Brain_Mask=Brain_Mask)
            sio.savemat(os.path.join(dir_pnet_pFN_indv, 'FN.mat'), {"FN": pFN}, do_compression=True)
            sio.savemat(os.path.join(dir_pnet_pFN_indv, 'TC.mat'), {"TC": TC}, do_compression=True)
            # save FNs in nii.gz and TC as txt file  FY 07/26/2024
            output_FN(FN=pFN,
                      file_output=os.path.join(dir_pnet_pFN_indv, 'FN.mat'),
                      file_brain_template = Brain_Template,
                      dataFormat=dataFormat, 
                      Cheader = CHeader,
                      Nheader = NHeader)
            np.savetxt(os.path.join(dir_pnet_pFN_indv, 'TC.txt'), TC)

        # ============================================= #

    print('Finished FN computation at ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), file=logFile_FNC, flush=True)


[docs]def check_gFN(gFN: np.ndarray, method='SR-NMF', logFile=None):
    """
    Check the values in gFNs to ensure compatibility to the desired FN model

    :param gFN: group level FNs, 2D matrix for surface type [V K], 4D matrix for volume type [X Y Z K], where K is the number of FNs
    :param method: 'SR-NMF'
    :param logFile: directory of the log file

    Yuncong Ma, 9/27/2023
    """

    if method == 'SR-NMF':
        if np.sum(gFN < 0) > 0:
            raise ValueError('When using method SR-NMF, all values in gFNs should be non-negative')
            if logFile is not None:
                logFile = open(logFile, 'a')
                print('When using method SR-NMF, all values in gFNs should be non-negative', file=logFile, flush=True)