func FindSimilarity_1(sources []toolkit.M) []toolkit.M { result := make([]toolkit.M, 0) pqs := pque.NewQue() pqs.WorkerCount = 100 pqs.Fn = func(in interface{}) interface{} { s := in.(toolkit.M) sname := s["Trimmed"].(string) fmt.Printf("Received job for %v - %s \n", s["_id"], s["Trimmed"]) for _, s1 := range sources { //-- do nothing s1name := s1["Trimmed"].(string) if s1name == sname { //-- do something } } return s } pqs.FnDone = func(in interface{}) { s := in.(toolkit.M) fmt.Printf("Complete job for %v - %s. %d of %d \n", s["_id"], s["Trimmed"], pqs.CompletedJob, pqs.PreparedJob) } pqs.WaitForKeys() for _, s := range sources { pqs.SendKey(s) } pqs.KeySendDone() pqs.WaitForCompletion() return result }
func main() { tStart := time.Now() defer DbCtx().Close() fmt.Println("Material Duplicate Detector") fmt.Println("v0.8") fmt.Println("") tPopulate := time.Now() fmt.Print("1. Populate master material ... ") mats := PopulateMaterial() fmt.Printf("Done (%v). %d records has been populated \n", time.Since(tPopulate), len(mats)) tBuildIndex := time.Now() var matrefs []tk.M copy(mats, matrefs) byId := make(map[string][]string, 0) byNames := make(map[string][]string, 0) exactNameCount := 0 fmt.Print("2. Build index ... ") for _, matl := range mats { matName := matl["Trimmed"].(string) matId := matl["Matnr"].(string) if _, byNameExist := byNames[matName]; !byNameExist { byNames[matName] = []string{matId} byId[matId] = []string{matId} } else { firstId := byNames[matName][0] if firstId != matId { byNames[matName] = append(byNames[matName], matId) copy(byId[byNames[matName][0]], byNames[matName]) if len(byNames[matName]) == 2 { exactNameCount++ } } } } fmt.Printf("Done (%v). Found %d index, %d are duplicated using exactly same description\n", time.Since(tBuildIndex), len(byNames), exactNameCount) fmt.Println("3. Saving data ... ") tSave := time.Now() recordCount := len(byNames) DbCtx().Connection.Query().From("Items").Delete().Run(nil) que := pque.NewQue() que.WorkerCount = 50 que.Fn = func(in interface{}) interface{} { m := in.(tk.M) DbCtx().Connection.Query().From("Items").Save().Run(tk.M{"data": m}) return m } que.FnDone = func(in interface{}) { m := in.(tk.M) fmt.Printf("Saving %s, completed: %3.1f pct \n", m["title"], float64(que.CompletedJob*100)/float64(recordCount)) } que.WaitForKeys() for matname, ids := range byNames { m := tk.M{} m.Set("_id", ids[0]) m.Set("title", matname) if len(ids) == 1 { m.Set("duplicated", 0) } else { m.Set("duplicated", 1) m.Set("duplicateid", ids[1:]) } que.SendKey(m) } que.KeySendDone() que.WaitForCompletion() fmt.Printf("Saving %d records in %v \n", que.CompletedJob, time.Since(tSave)) //FindSimilarity_1(mats) fmt.Printf("All process completed in %v \n", time.Since(tStart)) }