/* Author Chung Chan Copyright (C) 2021 Chung Chan */ load(descriptive)$ load(draw)$ /**************** helper functions ****************/ /* Simplify an expression. */ simplify(ex) := ev(fullratsimp(ex), simp)$ /* Log base 2. */ log2(x):=log(x)/log(2)$ /* Characteristic function. */ chi(p):=(if p then 1 else 0)$ /******************************** Data construction/transformation ********************************/ /* Create data with feature names fns and a list lst of instances (list of feature values). */ build_data_from_list(fns, lst):=apply('matrix, cons(fns,lst))$ /* Create data with feature names fns and a matrix M with rows as the instances. */ build_data_from_matrix(fns, M):=build_data_from_list(fns, args(M))$ /* Generate data with feature names fns by generating the instances one-by-one by calling the function gen n times, each with an unique index as its argument from 1 to n. */ build_data(fns, gen, n):=block( [lst:[]], for i:1 thru n do lst:endcons( block( [v:gen(i)], for j:1 thru length(fns) do v:subst(fns[j]=v[j], v), map(simplify, v) ), lst ), build_data_from_list(fns, lst) )$ /* Return the column index of the feature fn in the list fns of feature names. The returned value is false if fn is not in fns. */ feature_index(fns, fn):=block( [i:1], for x in fns unless x=fn do i:i+1, if i<=length(fns) then i )$ /* Return the size (number of instances) of the data. */ size(data):=length(data)-1$ /* Return the i-th instance of data as a list of feature values. */ get_data(data, i):=args(data)[i+1]$ /* Return the feature names of the data. */ feature_names(data):=args(data)[1]$ /* Return the matrix of all feature values. */ all_feature_values(data):=submatrix(1,data)$ /* Return a list of feature values in data for the feature fn. */ feature_values(data, fn):=block( [i:feature_index(feature_names(data), fn)], rest(args(transpose(col(data,i)))[1],1) )$ /* Subsample data satisfying the condition cond, which is a boolean function that takes a unique index (1, 2, ...) as the argument. */ subsample_data(data, cond):=block( [ fns:feature_names(data), fvs: args(all_feature_values(data)), lst:[] ], for i:1 thru size(data) do block( [ r: fvs[i], c:cond(i) ], if simplify(psubst(map("=",fns,r),c)) then lst:endcons(r,lst) ), build_data_from_list(fns,lst) )$ /* Transform the features of data to nfns using the generator ngen. */ transform_features(data, nfns, ngen):=block( [ lst:[], fns:feature_names(data), fvs:args(all_feature_values(data)) ], for i:1 thru size(data) do lst:endcons ( block( [ v:ngen(i), r:fvs[i] ], v:psubst(map("=", fns, r), v), for j:1 thru length(nfns) do v:subst(nfns[j]=v[j], v), map(simplify,v) ), lst ), build_data_from_list(nfns,lst) )$ /* Stack the list lst of data (vertically). */ stack_data([lst]):=build_data_from_list( feature_names(lst[1]), apply('append,map(lambda([D], args(all_feature_values(D))), lst)) )$ /* Plot labeled data with x-axis xy[1] and y-axis xy[2]. */ plot_labeled_data(data,xy,target):= block( [ Ds:split_data(data, target), D, lst:[] ], for i:1 thru length(Ds[2]) do ( D:transform_features(Ds[2][i], xy, subst('xy=xy, lambda([i], xy))), lst:append(lst,[color=i, points(args(all_feature_values(D)))]) ), draw2d(lst,xlabel=xy[1],ylabel=xy[2]) )$ /* Split data according to a splitting criterion X */ split_data(data, X):=block( [D, Djs:[]], D:transform_features(data, ['C], subst('X=X,lambda([i], [X]))), t:empirical(feature_values(D, 'C)), for j:1 thru length(t[1]) do block([Dj], Dj:subsample_data( data, subst( ['c=X, 'v=t[1]], lambda([i], equal(c,v[j])) ) ), Djs:endcons(Dj, Djs) ), [t,Djs] )$ /* Compute the empirical distribution of the list lst. A pair is returned, where - the first element is the list of unique values sorted in ascending order, and - the second element is their fractional number of occurences. */ empirical(lst):=block( [ s:sort(lst), n:length(lst), c:1, us:[], fs:[] ], for i:1 thru n do ( if i1 /* recur on tail maxima (tm) */ then block( [tm :apply('maxima,rest(lst))], if lst[1]>=tm[2] then maxima(lst[1]) else [tm[1]+1,tm[2]] ) /* base cases */ else if length(lst)>0 then [1, lst[1]] else [0, -inf]$ /* trailing $ gives no output. */ /* Return the list of majority values. */ majorities(data, fn):=block( [ t: empirical(feature_values(data, fn)), mf,lst:[] ], mf: lmax(t[2]), for i:1 thru length(t[1]) do if equal(t[2][i], mf) then lst: endcons(t[1][i], lst), lst )$ /* Given the lists of class values and predictions for different instances, return the list of evaluation metrics: [TP, FN, FP, TN, accuracy, precision, recall, specificity] */ clf_performance_metrics(class,pred):= block( [ TP:0, FN:0, FP:0, TN:0, n: length(class) ], for i:1 thru n do if class[i]>0 then if pred[i]>0 then TP:TP+1 else FN:FN+1 else if pred[i]>0 then FP:FP+1 else TN:TN+1, accuracy:(TP+TN)/(TP+TN+FP+FN), precision:TP/(TP+FP), recall: TP/(TP+FN), specificity: TN/(TN+FP), [TP, FN, FP, TN, accuracy, precision, recall, specificity] )$ /***************** Impurity measures *****************/ /* Entropy */ entropy(ps):=lsum(if equal(p,0) then 0 else -p*log2(p), p, ps)$ h([ps]):=entropy(ps)$ /* Information content of target */ Info(data, target):=block( [ps:empirical(feature_values(data, target))[2]], entropy(ps) )$ /* Conditional information content of target given a splitting criterion X */ InfoX(data, target, X):=block( [info:0, t, ps, Ds], t:split_data(data, X), ps:t[1][2], Ds:t[2], for j:1 thru length(Ds) do if size(Ds[j])>0 then ( info:info+ps[j]*entropy( empirical(feature_values(Ds[j],target))[2] ) ), info )$ /* Information gain of target with respect to a splitting criterion X. */ Gain(data, target, X):=Info(data, target) - InfoX(data, target, X)$ /* Split information of a splitting criterion */ SplitInfo(data, X):=block( [D:transform_features(data, ['C], subst('X=X, lambda([i], [X])))], t:empirical(feature_values(D, 'C)), entropy(t[2]) )$ /* Information gain ratio of target with respect to a splitting criterion X. */ GainRatio(data, target, X):=Gain(data, target, X)/SplitInfo(data, X)$ /* Gini impurity index */ gini(ps):=lsum(p*(1-p), p, ps)$ g([ps]):=gini(ps)$ /* Gini impurity of target */ Gini(data, target):=block( [t:empirical(feature_values(data, target))], gini(t[2]) )$ /* Conditional Gini impurity of target given a splitting criterion X. */ GiniX(data, target, X):=block( [_gini:0, t, ps, Ds], t:split_data(data, X), ps:t[1][2], Ds:t[2], for j:1 thru length(Ds) do if size(Ds[j])>0 then _gini:_gini+ps[j]*gini( empirical(feature_values(Ds[j], target))[2] ), _gini )$ /* Drop in Gini impurity of target with respect to a splitting criterion X. */ GiniDrop(data, target, X):=Gini(data, target) - GiniX(data, target, X)$ /************************* Rule-based classification *************************/ /* Helper functions for calculating FOIL gain and prune. */ foilgain(p_0,n_0,p_1,n_1):=if equal(p_1, 0) then 0 else p_1*(log2(p_1/(p_1+n_1))-log2(p_0/(p_0+n_0)))$ foilprune(p,n):=if equal(p+n,0) then minf else (p-n)/(p+n)$ conjoin(conjts):=apply( "and", map(lambda([v], if op(v)="=" then apply(equal,args(v)) else v),conjts) )$ /* For the rules R_0: rest(cjts,-1) => Y=1 R_1: cjts => Y=1 where rest(cjts,-1) is the list of conjuncts in cjts except the last one, compute [p_0, n_0, p_1, n_1] where - p_i is the number of positive examples (with Y>0 in data) covered by R_i, and - n_i is the number of negative examples (with Y<0 in data) covered by R_i. */ pn_counts(data, Y, cjts):= block([data_0, cond_0, p_0, n_0, data_1, cond_1, p_1, n_1], cond_0: conjoin(rest(cjts, -1)), data_0: subsample_data(data, subst('cond_0=cond_0,lambda([i],cond_0))), p_0: size(subsample_data(data_0, subst('Y=Y,lambda([i],Y>0)))), n_0: size(data_0) - p_0, cond_1: conjoin(cjts), data_1: subsample_data(data, subst('cond_1=cond_1,lambda([i],cond_1))), p_1: size(subsample_data(data_1, subst('Y=Y,lambda([i],Y>0)))), n_1: size(data_1) - p_1, [p_0,n_0,p_1,n_1] )$ /* FOIL gain from rule R_0 to rule R_1 where R_0: rest(cjts,-1) => Y=1 R_1: cjts => Y=1 and rest(cjts,-1) is the list of conjuncts in cjts except the last one. */ FOILGain(data, Y, cjts):=apply('foilgain, pn_counts(data, Y, cjts))$ /* FOIL prune from rule R_1 to R_0 where R_0: rest(cjts,-1) => Y=1$ R_1: cjts => Y=1$ and rest(cjts,-1) is the list of conjuncts in cjts except the last one. */ FOILPrune(data, Y, cjts):=apply( lambda([p_0,n_0,p_1,n_1], [foilprune(p_1,n_1), foilprune(p_0,n_0)]), pn_counts(data, Y, cjts) )$ /********** Clustering **********/ /* Compute the squared Euclidean distance of two vectors p and q. */ sq_dist(p,q):=block( [ v ], if listp(p) then p:apply('matrix, [p]), if listp(q) then q:apply('matrix, [q]), v: p-q, v . v )$ /* Euclidean distance between two vectors/lists p and q. */ dist(p,q):=sqrt(sq_dist(p,q))$ /* Return a row index of the matrix C corresponding to the nearest neighbor of p. */ nearest_neighbor(p, Q):=block( [ i, cv, mv:inf ], if listp(p) then p:apply('matrix,[p]), if listp(Q) then Q:apply('matrix,Q), for j:1 thru length(Q) do ( cv:sq_dist(p,row(Q,j)), if (cv < mv) then ( i:j, mv:cv ) ), i )$ /* Return a list of row indices of matrix Q corresponding to the nearest neighbors of rows of matrix P. */ nearest_neighbors(P,Q):=block( if listp(P) then P:apply('matrix, P), if listp(Q) then Q:apply('matrix, Q), makelist(nearest_neighbor(row(P,i),Q),i,length(P)) )$ /* Return the cluster assignment that minimizes the distance to of the data points to the cluster centers in cs. */ clusters_for_centroids(data, cs):=block( [ cfns: feature_names(cs), P:all_feature_values( transform_features(data, cfns, subst('cfns=cfns,lambda([i], cfns)) ) ), Q:all_feature_values(cs) ], nearest_neighbors(P, Q) )$ /* Compute the total squared distance of the data points to their cluster centers cs according to the cluster assignment C. */ variation(data,C,cs):=block( [ P: all_feature_values(data), Q: all_feature_values(cs), var:0 ], for j:1 thru size(data) do var:var+sq_dist(row(P,j), row(Q,C[j])), var )$ /* Compute the centroid (as a row vector) of the rows of a non-empty input matrix P. P may also be a list of list. */ centroid(P):=block( [ n: length(P), u ], if listp(P) then P: apply('matrix, P), u: 1+zeromatrix(1, n), (u . P)/n )$ /* Compute the centroids of the list Cs of clusters using only features in cfns . The centroids are returned as data with features cfns. */ centroids(Cs, cfns):=block( [ cs:[], c ], for data in Cs do ( c:centroid( all_feature_values( transform_features(data, cfns, subst('cfns=cfns,lambda([i], cfns)) ) ) ), cs:endcons(args(c)[1],cs) ), build_data_from_list(cfns,cs) )$ /* Split data by the cluster assignment C, which is a list of cluster indices, one for each data point. */ split_data_by_clusters(data, C):=block( [Ds:[], t, Dj], t:empirical(C), for j:1 thru length(t[1]) do ( Dj:subsample_data( data, subst( ['C=C, 'v=t[1]], lambda([i], equal(C[i],v[j])) ) ), Ds:endcons(Dj, Ds) ), [t,Ds] )$ /* Compute the centroids computed on the features in cfns according to the cluster assignment C. The centroids are returned as data with features cfns, in the ascending order of the cluster indices. */ centroids_for_clusters(data, cfns, C):=block( [ t: split_data_by_clusters(data, C), nfns: endcons(C, cfns) ], centroids(t[2], cfns) )$ /* Cluster distances */ min_dist(P, Q):=block( [ md2:inf ], for i:1 thru length(P) do for j:1 thru length(Q) do md2: min(md2, sq_dist(row(P, i), row(Q, j))), sqrt(md2) )$ max_dist(P, Q):=block( [ md2:-inf ], for i:1 thru length(P) do for j:1 thru length(Q) do md2: max(md2, sq_dist(row(P, i), row(Q, j))), sqrt(md2) )$ centroid_dist(P, Q):=block( [ p: centroid(P), q: centroid(Q) ], sqrt(sq_dist(p, q)) )$ ward_dist(P, Q):=block( [ M: apply('matrix,append(args(P), args(Q))), c, d:0 ], c: centroid(M), for i:1 thru length(M) do d:d+sq_dist(row(M, i), c), d )$ /* Compute the sorted list of pairwise cluster distances for data with clustering assignment C and the distance measure dist. The returned list consists of elements [i,j,d] where d is the distance of Cluster i and Cluster j. */ pairwise_cluster_dists(data, C, dist):=block( [ t: split_data_by_clusters(data, C), ids, Cs, k, lst:[] ], ids: t[1][1], Cs: t[2], k: length(Cs), for i:1 thru k do for j:i+1 thru k do lst:endcons( [ [ids[i],ids[j]], apply(dist,[all_feature_values(Cs[i]),all_feature_values(Cs[j])]) ], lst ), sort(lst,lambda([a,b],a[2]1 then ( for ki:1 thru nC do ( nCi:length(Cs[ki]), for i:1 thru nCi do ( ds:[Cs[ki][i]], if nCi>1 then ( /* mean intra-cluster distance */ a:0, for j:1 thru nCi do if not equal(i,j) then ( /* print(Cs[ki][i],Cs[ki][j]), */ a:a+dist(row(P,Cs[ki][i]), row(P, Cs[ki][j])) ), a: a/(nCi-1), /* mean nearest-cluster distance */ b:inf, for kj:1 thru nC do if not equal(ki,kj) then ( /* mean distance to another cluster */ d:0, nCj:length(Cs[kj]), for j:1 thru nCj do ( /* print(Cs[ki][i],Cs[kj][j]), */ d:d+dist(row(P,Cs[ki][i]), row(P, Cs[kj][j])) ), d: d/nCj, if d < b then ( minj: t[1][kj], b:d ) ), /* silhouette coefficient */ s: (b-a)/max(a,b), ds:append(ds,[a,b,minj,s]) ) else ds:append(ds,[false, false, false, 1]), lst: endcons(ds, lst) ) ), build_data_from_list( ['a, 'b, "nearest", 's], map(rest, sort(lst, lambda([a,b], a[1]best[1] then best: [c,cons([rids[1],cds[j]],t[2])] ), best ) else ( best:[-inf,false], for i:1 thru m do ( t:lsa(U,delete(rids[i],rids),rest(cds)), c:U[rids[i]][cds[1]] + t[1], if c>best[1] then best: [c,cons([rids[i],cds[1]],t[2])] ), best ) ) )$ /* Carry the classes-to-clusters evaluation on the categorization L and cluster assignment C. The returned list consists of - the accuracy maximized over the classes-to-clusters assignment, - the assignment in the form of a list of [l,c] where l is a class index and c is a cluster index, and - the list consisting of - the list class of unique class labels in ascending order, - the list cluster of unique cluster labels in ascending order, and - the list lst of list of counts where lst[i][j] is the counts of instances associated with class index class[i] and cluster index cluster[j]. */ classes_to_clusters_eval(L, C):=block( [ tL:to_clusters(L), tC:to_clusters(C), Ls, Cs, lst:[], c, nL, nC, t ], nL:length(tL[1]), nC:length(tC[1]), Ls: map(setify, tL[2]), Cs: map(setify, tC[2]), for i:1 thru nL do ( c:[], for j:1 thru nC do c: endcons(length(intersection(Ls[i], Cs[j])), c), lst: endcons(c, lst) ), t:lsa(lst, makelist(i,i,nL), makelist(i,i,nC)), [ t[1]/length(C), map(lambda([ids],[tL[1][ids[1]],tC[1][ids[2]]]),t[2]), [tL,tC,lst] ] )$ /*********************** Association Rule Mining ***********************/ /* Return the support count of A in the transactional data. */ support_count(data, A):=lsum(chi(subsetp(A,T)),T,data)$ /* Return all the items in the transactional data. */ all_items(data):=listify(apply('union, data))$ /* Return a list of support counts corresponding to the list of itemsets in C. */ support_counts(data, C):=map(lambda([A],[A,support_count(data,A)]),C)$ /* Return the sublist of [A,c] from C where the frequent itemset A has counts c at least min_sup. */ frequent_itemsets(C, min_sup):=sublist(C,lambda([Ac], Ac[2]>=min_sup))$ /* Return the list of [A,c] where A is a frequent 1-itemset from data with count c at least min_sup. */ apriori1(data, min_sup):=block( [C:makelist({i},i,all_items(data))], C:support_counts(data, C), frequent_itemsets(C, min_sup) )$ /* Join step for apriori algorithm. It returns a list of candidate obtained by joining two frequent itemsets from L having all but the last items identical. */ apriori_join(data, L):=block( [C:[] ,m:length(L)], for i:1 thru m do for j:i+1 thru m do if equal(rest(L[i][1],-1),rest(L[j][1],-1)) then C: endcons(union(L[i][1],L[j][1]), C), C )$ /* Prune step for apriori algorithm. It prune the candidate list C of k-itemsets if any of its (k-1)-subsets is not in the set L of (k-1)-frequent item sets. */ apriori_prune(data, C, L):=sublist(C, lambda([A], block( [not_prune:true, Al], Al:listify(A), for i:1 thru length(L[1]) while not_prune do not_prune: elementp(disjoin(Al[i],A),L), not_prune ) ) )$ /* Generate the frequent k-itemsets given the frequent (k-1)-itemsets stored in L. */ apriorik(data, L, min_sup):=block( [C:[] ,m:length(L)], C: apriori_join(data, L), C: apriori_prune(data, C, setify(map(first,L))), C: support_counts(data, C), frequent_itemsets(C, min_sup) )$ /* Generate all frequent itemsets starting with k from 1 until there are no more frequent k-itemsets. */ apriori(data, min_sup):=block( [L: apriori1(data, min_sup), lst:[]], while length(L)>1 do ( lst: endcons(L, lst), L: apriorik(data, L, min_sup) ), lst )$ /* Creates an association rule A => B. */ ar(A,B):=matrix([A,"⇒",B])$ /* Returns the itemset associated with the antecedent of R. */ ar_A(R):=R[1,1]$ /* Returns the itemset associated with the consequence of R.*/ ar_B(R):=R[1,3]$ /* Computer various qualities of an association rule R from a transactional data set data. */ coverage(data, R):=support_count(data, ar_A(R))/length(data)$ support(data, R):=support_count(data, union(ar_A(R),ar_B(R)))/length(data)$ confidence(data, R):=support(data, R)/coverage(data, R)$ prior(data, R):=support_count(data, ar_B(R))/length(data)$ lift(data, R):=confidence(data, R)/prior(data, R)$ /* Generate association rules R for a transactional data set data with support at least s and confidence at least c. It returns a list of lists in the form: [ R, coverage(data, R), support(data, R), confidence(data, R), prior(data, R), lift(data, R) ] */ support_confidence_framework(data, s, c):= block( [ n: length(data), L, C, A, B, C_count, A_count, B_count, lst:[] ], L: apply('append, apriori(data, n*s)), for i:1 thru length(L) do ( C:L[i][1], C_count: L[i][2], for k:1 thru length(C)-1 do for A in powerset(C, k) do ( A_count : assoc(A, L), if A_count <= C_count/c then ( B: setdifference(C,A), B_count : assoc(B, L), lst:endcons( [ ar(A,B), A_count/n, C_count/n, C_count/A_count, B_count/n, C_count/A_count/B_count*n ], lst ) ) ) ), lst )$ /* Bottom-up construction of iceberg cube where - data is the base cuboid with - dims is the dimensions names, - fact is the name of the fact, and - min_sup is the minimum value of fact for the iceberg condition. */ BUC(data, dims, fact, min_sup):= block( [ out: [], fns: feature_names(data), _BUC: lambda( [input, ns_dims, dim], block( [ s:apply("+",feature_values(input, fact)), c, t ], if s>=min_sup then ( c: map( lambda([fn,v], if elementp(fn, ns_dims) then v else if equal(fn,fact) then s else "*" ), fns, args(all_feature_values(input))[1] ), out:endcons(c, out), for i:dim thru length(dims) do ( t: split_data(input, dims[i]), /* print(input, ns_dims, dims[i], c, t), */ for j:1 thru length(t[2]) do apply(_BUC, [t[2][j], adjoin(dims[i], ns_dims), i+1]) ) ) ) ) ], apply(_BUC, [data, {}, 1]), build_data_from_list(fns, out) )$