// NewIndepGraph constructs a new Graph given a DataGroup. func NewIndepGraph(data []*utils.VarData, pval float64) *Graph { igraph := Graph{make(map[int][]int), nil} n := len(data) // IDs and Reverse IDs. ids := make([]int, n) rids := make(map[int]int) for i := 0; i < n; i++ { ids[i] = data[i].Varid rids[ids[i]] = i igraph.adjlist[ids[i]] = []int{} } sys.Println("Constructing independence graph...") // Construct the indepedency graph by adding an edge if there exists a dependency relation. for i := 0; i < n; i++ { for j := i + 1; j < n; j++ { v1, v2 := ids[i], ids[j] // Initialize the count matrix mdata. //sys.Println("Initializing count matrix...") p, q := data[i].Categories, data[j].Categories mdata := make([][]int, p+1) for k := 0; k < p+1; k++ { mdata[k] = make([]int, q+1) } // len(data[i].Data) == len(data[j].Data) by definition. m := len(data[i].Data) for k := 0; k < m; k++ { mdata[data[i].Data[k]][data[j].Data[k]]++ } //sys.Println("Counting totals and assigning to edges...") // Total on the x axis, y axis and x+y respectively. tx, ty, tt := make([]int, q), 0, 0 //sys.Println("Y-axis...") for x := 0; x < p; x++ { ty = 0 for y := 0; y < q; y++ { ty += mdata[x][y] tx[y] += mdata[x][y] } mdata[x][q] = ty } // Compute total on the x axis. //sys.Println("X-axis...") for y := 0; y < q; y++ { mdata[p][y] = tx[y] tt += tx[y] } // Total total. mdata[p][q] = tt // Checks if variables i, j are independent. //sys.Println("Checking for pairwise independence...") indep := GTest(p, q, mdata, n*(n-1)/2, pval) //sys.Printf("%t\n", indep) // If not independent, then add an undirected edge i-j. if !indep { //sys.Println("Not independent. Creating edge...") igraph.adjlist[v1] = append(igraph.adjlist[v1], v2) igraph.adjlist[v2] = append(igraph.adjlist[v2], v1) } //else { //sys.Println("Independent. No edges.") //} } } // utils.Union-utils.Find to discriminate each set of connected variables that are fully disconnected of // another set of connected set of variables sys.Println("utils.Finding disconnected subgraphs...") // Set of utils.Union-utils.Find trees. sets := make([]*utils.UFNode, n) // At first every vertex has its own set. for i := 0; i < n; i++ { sets[i] = utils.MakeSet(ids[i]) } sys.Println("Preparing to test each vertex of the independence graph for disconnectivity...") // If a vertex u has an edge with another vertex v, then union sets that contain u and v. for i := 0; i < n; i++ { v1 := ids[i] m := len(igraph.adjlist[v1]) for j := 0; j < m; j++ { v2 := igraph.adjlist[v1][j] rv2 := rids[v2] if utils.Find(sets[i]) == utils.Find(sets[rv2]) { continue } utils.Union(sets[i], sets[rv2]) } } igraph.Kset = nil for i := 0; i < n; i++ { if sets[i] == sets[i].Pa { igraph.Kset = append(igraph.Kset, utils.UFVarids(sets[i])) } } return &igraph }
// Gens Learning Algorithm // Based on the article // Learning the Structure of Sum Product Networks // Robert Gens and Pedro Domingos // International Conference on Machine Learning 30 (ICML 2013) func Gens(sc map[int]Variable, data []map[int]int, kclusters int, pval, eps float64, mp int) spn.SPN { n := len(sc) sys.Printf("Sample size: %d, scope size: %d\n", len(data), n) // If the data's scope is unary, then we return a leaf (i.e. a univariate distribution). if n == 1 { sys.Println("Creating new leaf...") // m number of instantiations. m := len(data) // pr is the univariate probability distribution. var tv *Variable for _, v := range sc { tv = &v } counts := make([]int, tv.Categories) for i := 0; i < m; i++ { counts[data[i][tv.Varid]]++ } leaf := spn.NewGaussian(tv.Varid, counts) //sys.Println("Leaf created.") return leaf } // Else we check for independent subsets of variables. We separate variables in k partitions, // where every partition is pairwise indepedent with each other. //sys.Println("Preparing to create new product node...") sys.Println("Creating VarDatas for Independency Test...") vdata, l := make([]*utils.VarData, n), 0 for _, v := range sc { tn := len(data) // tdata is the transpose of data[k]. tdata := make([]int, tn) for j := 0; j < tn; j++ { tdata[j] = data[j][v.Varid] } vdata[l] = utils.NewVarData(v.Varid, v.Categories, tdata) l++ } sys.Println("Creating new Independency graph...") // Independency graph. igraph := indep.NewUFIndepGraph(vdata, pval) vdata = nil // If true, then we can partition the set of variables in data into independent subsets. This // means we can create a product node (since product nodes' children have disjoint scopes). if len(igraph.Kset) > 1 { sys.Println("Found independency. Separating independent sets.") //sys.Println("Found independency between variables. Creating new product node...") // prod is the new product node. m is the number of disjoint sets. kset is a shortcut. prod, m, kset := spn.NewProduct(), len(igraph.Kset), &igraph.Kset tn := len(data) for i := 0; i < m; i++ { // Data slices of the relevant vectors. tdata := make([]map[int]int, tn) // Number of variables in set of variables kset[i]. s := len((*kset)[i]) for j := 0; j < tn; j++ { tdata[j] = make(map[int]int) for l := 0; l < s; l++ { // Get the instanciations of variables in kset[i]. //sys.Printf("[%d][%d] => %v vs %v | %v vs %v\n", j, k, (*kset)[i][k], len(data[j]), len(tdata[j]), k) k := (*kset)[i][l] tdata[j][k] = data[j][k] } } // Create new scope with new variables. nsc := make(map[int]Variable) for j := 0; j < s; j++ { t := (*kset)[i][j] nsc[t] = Variable{t, sc[t].Categories} } //sys.Printf("LENGTH: %d\n", len(tdata)) //sys.Println("Product node created. Recursing...") // Adds the recursive calls as children of this new product node. prod.AddChild(Gens(nsc, tdata, kclusters, pval, eps, mp)) } return prod } igraph = nil // Else we perform k-clustering on the instances. sys.Println("No independency found. Preparing for clustering...") m := len(data) mdata := make([][]int, m) for i := 0; i < m; i++ { lc := len(data[i]) mdata[i] = make([]int, lc) l := 0 keys := make([]int, lc) for k := range data[i] { keys[l] = k l++ } sort.Ints(keys) for j := 0; j < lc; j++ { mdata[i][j] = data[i][keys[j]] } } var clusters []map[int][]int if kclusters > 0 { sys.Printf("data: %d, mdata: %d\n", len(data), len(mdata)) if len(mdata) < kclusters { //Fully factorized form. //All instances are approximately the same. prod := spn.NewProduct() m := len(data) for _, v := range sc { counts := make([]int, v.Categories) for i := 0; i < m; i++ { counts[data[i][v.Varid]]++ } leaf := spn.NewGaussian(v.Varid, counts) prod.AddChild(leaf) } return prod } clusters = cluster.KMeansV(kclusters, mdata) } else if kclusters == -1 { clusters = cluster.DBSCAN(mdata, eps, mp) } else { clusters = cluster.OPTICS(mdata, eps, mp) } k := len(clusters) //sys.Printf("Clustering similar instances with %d clusters.\n", k) if k == 1 { // Fully factorized form. // All instances are approximately the same. prod := spn.NewProduct() m := len(data) for _, v := range sc { counts := make([]int, v.Categories) for i := 0; i < m; i++ { counts[data[i][v.Varid]]++ } leaf := spn.NewGaussian(v.Varid, counts) counts = nil prod.AddChild(leaf) } return prod } mdata = nil sys.Println("Reformating clusters to appropriate format and creating sum node...") sum := spn.NewSum() for i := 0; i < k; i++ { ni := len(clusters[i]) ndata := make([]map[int]int, ni) l := 0 for k := range clusters[i] { ndata[l] = make(map[int]int) for index, value := range data[k] { ndata[l][index] = value } l++ } nsc := make(map[int]Variable) for k, v := range sc { nsc[k] = v } //sys.Println("Created new sum node child. Recursing...") sum.AddChildW(Gens(nsc, ndata, kclusters, pval, eps, mp), float64(ni)/float64(len(data))) } clusters = nil return sum }