// NewSquareProduct creates a new SquareProduct func NewSquareProduct() *SquareProduct { return &SquareProduct{*spn.NewProduct()} }
// Gens Learning Algorithm // Based on the article // Learning the Structure of Sum Product Networks // Robert Gens and Pedro Domingos // International Conference on Machine Learning 30 (ICML 2013) func Gens(sc map[int]Variable, data []map[int]int, kclusters int, pval, eps float64, mp int) spn.SPN { n := len(sc) sys.Printf("Sample size: %d, scope size: %d\n", len(data), n) // If the data's scope is unary, then we return a leaf (i.e. a univariate distribution). if n == 1 { sys.Println("Creating new leaf...") // m number of instantiations. m := len(data) // pr is the univariate probability distribution. var tv *Variable for _, v := range sc { tv = &v } counts := make([]int, tv.Categories) for i := 0; i < m; i++ { counts[data[i][tv.Varid]]++ } leaf := spn.NewGaussian(tv.Varid, counts) //sys.Println("Leaf created.") return leaf } // Else we check for independent subsets of variables. We separate variables in k partitions, // where every partition is pairwise indepedent with each other. //sys.Println("Preparing to create new product node...") sys.Println("Creating VarDatas for Independency Test...") vdata, l := make([]*utils.VarData, n), 0 for _, v := range sc { tn := len(data) // tdata is the transpose of data[k]. tdata := make([]int, tn) for j := 0; j < tn; j++ { tdata[j] = data[j][v.Varid] } vdata[l] = utils.NewVarData(v.Varid, v.Categories, tdata) l++ } sys.Println("Creating new Independency graph...") // Independency graph. igraph := indep.NewUFIndepGraph(vdata, pval) vdata = nil // If true, then we can partition the set of variables in data into independent subsets. This // means we can create a product node (since product nodes' children have disjoint scopes). if len(igraph.Kset) > 1 { sys.Println("Found independency. Separating independent sets.") //sys.Println("Found independency between variables. Creating new product node...") // prod is the new product node. m is the number of disjoint sets. kset is a shortcut. prod, m, kset := spn.NewProduct(), len(igraph.Kset), &igraph.Kset tn := len(data) for i := 0; i < m; i++ { // Data slices of the relevant vectors. tdata := make([]map[int]int, tn) // Number of variables in set of variables kset[i]. s := len((*kset)[i]) for j := 0; j < tn; j++ { tdata[j] = make(map[int]int) for l := 0; l < s; l++ { // Get the instanciations of variables in kset[i]. //sys.Printf("[%d][%d] => %v vs %v | %v vs %v\n", j, k, (*kset)[i][k], len(data[j]), len(tdata[j]), k) k := (*kset)[i][l] tdata[j][k] = data[j][k] } } // Create new scope with new variables. nsc := make(map[int]Variable) for j := 0; j < s; j++ { t := (*kset)[i][j] nsc[t] = Variable{t, sc[t].Categories} } //sys.Printf("LENGTH: %d\n", len(tdata)) //sys.Println("Product node created. Recursing...") // Adds the recursive calls as children of this new product node. prod.AddChild(Gens(nsc, tdata, kclusters, pval, eps, mp)) } return prod } igraph = nil // Else we perform k-clustering on the instances. sys.Println("No independency found. Preparing for clustering...") m := len(data) mdata := make([][]int, m) for i := 0; i < m; i++ { lc := len(data[i]) mdata[i] = make([]int, lc) l := 0 keys := make([]int, lc) for k := range data[i] { keys[l] = k l++ } sort.Ints(keys) for j := 0; j < lc; j++ { mdata[i][j] = data[i][keys[j]] } } var clusters []map[int][]int if kclusters > 0 { sys.Printf("data: %d, mdata: %d\n", len(data), len(mdata)) if len(mdata) < kclusters { //Fully factorized form. //All instances are approximately the same. prod := spn.NewProduct() m := len(data) for _, v := range sc { counts := make([]int, v.Categories) for i := 0; i < m; i++ { counts[data[i][v.Varid]]++ } leaf := spn.NewGaussian(v.Varid, counts) prod.AddChild(leaf) } return prod } clusters = cluster.KMeansV(kclusters, mdata) } else if kclusters == -1 { clusters = cluster.DBSCAN(mdata, eps, mp) } else { clusters = cluster.OPTICS(mdata, eps, mp) } k := len(clusters) //sys.Printf("Clustering similar instances with %d clusters.\n", k) if k == 1 { // Fully factorized form. // All instances are approximately the same. prod := spn.NewProduct() m := len(data) for _, v := range sc { counts := make([]int, v.Categories) for i := 0; i < m; i++ { counts[data[i][v.Varid]]++ } leaf := spn.NewGaussian(v.Varid, counts) counts = nil prod.AddChild(leaf) } return prod } mdata = nil sys.Println("Reformating clusters to appropriate format and creating sum node...") sum := spn.NewSum() for i := 0; i < k; i++ { ni := len(clusters[i]) ndata := make([]map[int]int, ni) l := 0 for k := range clusters[i] { ndata[l] = make(map[int]int) for index, value := range data[k] { ndata[l][index] = value } l++ } nsc := make(map[int]Variable) for k, v := range sc { nsc[k] = v } //sys.Println("Created new sum node child. Recursing...") sum.AddChildW(Gens(nsc, ndata, kclusters, pval, eps, mp), float64(ni)/float64(len(data))) } clusters = nil return sum }