// NameToNutrientMigration ... func NameToNutrientMigration(s *State) { for _, recipe := range s.Recipes { recipe.Nutrients = *SetDietaryInfo(&recipe.Nutrients, recipe.Name) fmt.Println(models.RemoveMetaData(recipe.Name)) recipe.Name = models.RemoveMetaData(recipe.Name) x := struct { Name string `json:"name"` Nutrients models.NutrientInfoResponse `json:"nutrients"` ID string `json:"objectId"` UUID string `json:"uuid"` }{ Name: models.RemoveMetaData(recipe.Name), Nutrients: *SetDietaryInfo(&recipe.Nutrients, recipe.Name), ID: recipe.ObjectID(), UUID: lib.GetMD5Hash(models.RemoveMetaData(recipe.Name)), } xString, _ := json.Marshal(x) fmt.Println(string(xString)) _, status, errs := s.DB.Put(x, "Recipe", recipe.ID) if errs != nil || status == 400 { log.Error(status) log.Error(errs) log.Error(errors.Errorf("Unable to post recipe with ID: %s", recipe.ID)) break } // time.Sleep(1 * time.Second) } }
func saveRecipes(s *State, v models.VenueInfo) { u := uniqueRecipes(v.Recipes) var duplicates, new int for _, recipe := range u { if s.Recipes[recipe.ID].DartmouthID != recipe.ID { c := models.CreatedBy{ Kind: "Pointer", ClassName: "_User", ObjectID: "95xfYTL7GG", } returnObj, status, errs := s.DB.Post(models.ParseRecipe{ Name: models.RemoveMetaData(recipe.Name), Category: recipe.Category, DartmouthID: recipe.ID, Rank: recipe.Rank, UUID: lib.GetMD5Hash(models.RemoveMetaData(recipe.Name)), Nutrients: *SetDietaryInfo(&recipe.Nutrients, recipe.Name), Class: "Recipe", CreatedBy: c, }) if errs != nil || status == 400 { log.Error(status) log.Error(errors.Errorf("Unable to post recipe with ID: %d", recipe.ID)) continue } returnedRecipe := returnObj.(models.ParseRecipe) s.Recipes[recipe.ID] = returnedRecipe log.Debug("Created new recipe with objectId: ", returnedRecipe.ObjectID()) new++ } else { duplicates++ } } log.WithFields(logrus.Fields{ "Saved": new, "Duplicate": duplicates, }).Info("Scraped Recipes") }
// scrape is the CLI entry point for the menu scraper. Depending on flags it
// either runs the name/nutrient migration, replays a mocked scrape from
// output_DDS.json, or performs a live 7-day scrape: for each date and venue
// it fetches menus, meals, and recipes, fans out up to 50 concurrent
// nutrition requests through a channel semaphore, optionally posts results
// to Parse (--save) and/or writes them to JSON files (--write-files), and
// finally creates/saves/prunes notifications.
func scrape(c *cli.Context) {
	log.Info("Initializing Scraper")
	// Parse backend client; credentials are hard-coded here.
	p := parse.Client{
		BaseURL:       "https://api.parse.com/1",
		ApplicationID: "BAihtNGpVTx4IJsuuFV5f9LibJGnD1ZBOsnXk9qp",
		Key:           "zJYR2d3dFN3bXL6vUANZyoVLZ3bcTF7fpXTCrU7s",
	}
	// In-memory caches shared by the whole run; filled by InitParse below.
	s := State{
		DB:            &p,
		Recipes:       make(map[int]models.ParseRecipe),
		Nutrients:     make(map[int]bool),
		Offerings:     make(map[string]models.ParseOffering),
		Subscriptions: make(map[int][]string),
		Notifications: make(map[string]models.ParseNotification),
	}
	// One-off migration mode: run it and exit.
	if c.Bool("nameNutrientMigration") {
		fmt.Println("Running Migration...")
		InitParse(&s)
		NameToNutrientMigration(&s)
		return
	}
	// Mock mode: replay a previously captured scrape instead of hitting the
	// live menu API.
	if c.Bool("mock") {
		log.Info("Mocked Scrape")
		InitParse(&s)
		file, err := os.Open("output_DDS.json")
		if err != nil {
			log.Fatal(err)
		}
		info := models.VenueInfo{}
		if err := json.NewDecoder(file).Decode(&info); err != nil {
			log.Fatal(err)
		}
		saveToParse(&s, info)
		log.Info("End Mocked Scrape")
		return
	}
	InitParse(&s)
	pwd, err := os.Getwd()
	if err != nil {
		log.Fatal("Could not get working directory!")
	}
	if c.Bool("write-files") {
		fmt.Println()
		fmt.Println("Output files will be placed in", pwd)
	}
	rDate := ""
	// Go reference-time layout for MM/dd/YY.
	template := "01/02/06"
	// NOTE(review): the assignment in the if-initializer already sets rDate;
	// the body reassigns the same value and is redundant. An empty
	// --startDate falls through to time.Parse and will fail there.
	if rDate = c.String("startDate"); rDate != "" {
		rDate = c.String("startDate")
	}
	date, err := time.Parse(template, rDate)
	if err != nil {
		log.Fatal("Unable to parse date make sure it looks like MM/dd/YY")
	}
	// Scrape a 7-day window starting at the requested date.
	dateArray := []time.Time{}
	for i := 0; i < 7; i++ {
		dateToAdd := date.AddDate(0, 0, i)
		dateArray = append(dateArray, dateToAdd)
	}
	shouldPost := c.Bool("save")
	// Accumulated across all dates/venues; flushed after the loops.
	notificationsToCreate := []models.Notification{}
	for _, date := range dateArray {
		log.WithFields(logrus.Fields{
			"date": date.Format(template),
		}).Info("Start Scrape")
		// We want to get all Available SIDS
		sids, err := lib.AvailableSIDS()
		if err != nil {
			log.Fatal(err)
		}
		log.WithFields(logrus.Fields{
			"count": len(sids),
		}).Info("SIDS")
		// How many nutrition routines we want to make at a time
		nutritionRoutines := 50
		for key, value := range sids {
			// Semaphore channel: one slot per in-flight nutrition request.
			throttleRequests := make(chan bool, nutritionRoutines)
			// NOTE(review): this defer sits inside a double loop, so the
			// closes all pile up until scrape returns rather than firing per
			// venue — the classic defer-in-loop pitfall. Harmless here only
			// because closing a channel is optional; verify before reusing
			// this pattern.
			defer close(throttleRequests)
			log.WithFields(logrus.Fields{
				"venue": key,
			}).Info("Venue Scrape")
			info := models.VenueInfo{
				Date: date,
			}
			sid, err := lib.SID(key)
			if err != nil {
				log.Error(err)
				continue
			}
			info.Venue = value
			info.Key = key
			info.SID = sid
			info.Menus, err = lib.MenuList(sid)
			log.WithFields(logrus.Fields{
				"count": len(info.Menus),
			}).Info("Got Menus")
			if err != nil {
				log.Error(err)
				continue
			}
			info.Meals, err = lib.MealList(sid)
			// NOTE(review): a MealList error is logged but not skipped, so
			// the meal loop below simply runs over an empty list.
			if err != nil {
				log.Error(err)
			}
			log.WithFields(logrus.Fields{
				"count": len(info.Meals),
			}).Info("Got Meals")
			for _, meal := range info.Meals {
				menuMeal := models.MenuMeal{
					Meal:  meal,
					Menus: models.MenuInfoSlice{},
				}
				for _, menu := range info.Menus {
					newRecipes, err := lib.
						RecipesMenuMealDate(sid, menu.ID, meal.ID, date)
					if err != nil {
						log.Error(err)
						continue
					}
					// Queue a notification for every recipe a user has
					// subscribed to.
					for _, recipe := range newRecipes {
						if len(s.Subscriptions[recipe.ID]) > 0 {
							notificationsToCreate = append(notificationsToCreate,
								models.Notification{
									RecipeID: recipe.ID,
									Name:     models.RemoveMetaData(recipe.Name),
									Day:      date.Day(),
									Month:    int(date.Month()),
									Year:     date.Year(),
									OnDate:   date,
									MenuName: menu.Name,
									MealName: meal.Name,
									Venue:    info.Key,
								})
						}
					}
					// We need to scrape the recipes so that we can create notifications
					// but if the offering exists then we can just skip everything else
					if offeringExists(&s, info.Key, menu.Name, meal.Name, date) {
						log.WithFields(logrus.Fields{
							"meal":  meal.ID,
							"menu":  menu.ID,
							"venue": info.Key,
							"date":  date.Format(template),
						}).Info("Offering Exists")
						// newRecipes, err := lib.RecipesMenuMealDate(sid, menu.ID, meal.ID, date)
						continue
					}
					if len(newRecipes) > 0 {
						menuMeal.Menus = append(menuMeal.Menus, menu)
					}
					info.Recipes = append(info.Recipes, newRecipes...)
				}
				info.MealsList = append(info.MealsList, menuMeal)
			}
			// This section is the part that benefits the most from concurrency
			// the top parts finish in about 5 seconds but this will take up to
			// 15 minutes if done one by one.
			log.WithFields(logrus.Fields{
				"count": len(info.Recipes),
			}).Info("Start Recipe Scrape")
			for index := range info.Recipes {
				// Start a new goroutine for each nutrition request
				go func(key string, index int, info *models.VenueInfo) {
					// Read from the semaphore after we are done to free up a space for
					// the next connection.
					defer func() {
						<-throttleRequests
					}()
					// GetNutrients returns a pointer but we don't really care about it
					// simply ignore it. We pass &info.Recipes[index] so that the actual
					// pointer in the info object will be updated, otherwise a copy
					// will be worked on and we won't see the result
					_, err := lib.GetNutrients(info.SID, &info.Recipes[index])
					if err != nil {
						log.Error(err)
					}
				}(key, index, &info)
				/// Add our request to the list of running requests.
				throttleRequests <- true
			}
			// We want to fill them up by default..
			// (Filling every slot blocks until all in-flight goroutines have
			// drained their token, i.e. this is the join barrier.)
			for i := 0; i < cap(throttleRequests); i++ {
				throttleRequests <- true
			}
			log.WithFields(logrus.Fields{
				"count": len(info.Recipes),
			}).Info("Finish Recipe Scrape")
			if shouldPost {
				saveToParse(&s, info)
			}
			log.WithFields(logrus.Fields{
				"venue": info.Key,
			}).Info("Finish Venue Scrape")
			// Write a file to the directory it is run under with the output
			if c.Bool("write-files") {
				fileName := fmt.Sprintf("output_%s.json", info.Key)
				filePath := path.Join(pwd, fileName)
				b, err := json.MarshalIndent(info, "", " ")
				if err != nil {
					fmt.Println("error:", err)
				}
				fmt.Println("Wrote to:", fileName)
				err = ioutil.WriteFile(filePath, b, 0644)
				if err != nil {
					log.Println(err)
					continue
				}
			}
		}
	}
	// Flush the notification queue built up across all dates/venues.
	ns := createNotifications(&s, notificationsToCreate)
	saveNotifications(&s, ns)
	removeOldNotifications(&s)
}