func TestScraper(t *testing.T) { Convey("Testing the scraper to search within wikipedia", t, func() { rb := new(scraper.RequestBit) rb.Url = "https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=" + url.QueryEscape("database") + "&format=json" rb.ResponseObjectInterface = new(requestStruct.WikiSearch) rb.Work() response := *rb.ResponseObjectInterface.(*requestStruct.WikiSearch) Convey("should have the correct url", func() { So(rb.Url, ShouldEqual, "https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=database&format=json") }) Convey("should store the plain response string", func() { So(rb.PlainResponse, ShouldContainSubstring, `"title":"Database","snippet":"A <span class='searchmatch'>database</span> is an organized collection of data .`) So(rb.PlainResponse, ShouldContainSubstring, `{"query-continue":{"search":{"sroffset":10}},"query":{"searchinfo":`) }) Convey("should store the given struct as a the response object", func() { log.Printf("Length word list: %+v", rb.ResponseObjectRawJson) So(reflect.TypeOf(rb.ResponseObjectRawJson).String(), ShouldEqual, "json.RawMessage") So(reflect.TypeOf(rb.ResponseArrayRawJson).String(), ShouldEqual, "[]json.RawMessage") }) Convey("should Unmarschal the actual response into the response object and the first result should be ", func() { So(response.Query.Search[0].Title, ShouldEqual, "Database") So(len(response.Query.Search), ShouldBeGreaterThan, 0) }) }) Convey("Testing the scraper to get a page from wikipedia", t, func() { rb := new(scraper.RequestBit) rb.Url = "http://en.wikipedia.org/w/api.php?rvprop=content&format=json&prop=revisions|categories&rvprop=content&action=query&titles=Yanqing_County" rb.ResponseObjectInterface = new(requestStruct.WikiPage) rb.Work() response := *rb.ResponseObjectInterface.(*requestStruct.WikiPage) Convey("should have the correct url", func() { So(rb.Url, ShouldEqual, "http://en.wikipedia.org/w/api.php?rvprop=content&format=json&prop=revisions|categories&rvprop=content&action=query&titles=Yanqing_County") }) Convey("should store the plain response string", func() { So(rb.PlainResponse, ShouldNotEqual, nil) }) Convey("should store the given struct as a the response object", func() { So(reflect.TypeOf(response).String(), ShouldEqual, "requestStruct.WikiPage") }) Convey("should Unmarschal the actual response into the response object", func() { So(response.Query.Pages["2256752"].Title, ShouldEqual, "Yanqing County") So(response.Query.Pages["2256752"].PageId, ShouldEqual, 2256752) }) Convey("should Unmarschal the actual response into the response object and contain the wanted data", func() { So(response.Query.Pages["2256752"].Rev[0].RawContent, ShouldContainSubstring, `name = {{raise|0.2em|Yanqing County}}`) }) }) }
/** * will get the content of a wikipedia page */ func GetWikipediaPage(firstPage string) string { // lets create a new http request object rb := new(scraper.RequestBit) rb.Url = "http://en.wikipedia.org/w/api.php?rvprop=content&format=json&prop=revisions|categories&rvprop=content&action=query&titles=" + url.QueryEscape(firstPage) glog.Infof("Url crawling in SearchWikipedia: %+v", rb.Url) // inject the struct for the json response rb.ResponseObjectInterface = new(requestStruct.WikiPage) rb.Work() // fire the request // for type assertion we need to explicite set the type of the returned interface object again // @TODO it would be nice to work without an interface here at all, but on the other hand to be flexible on the struct w2 := *rb.ResponseObjectInterface.(*requestStruct.WikiPage) // as the attribute 'Pages' is a map we neet to iterate trough it and return the first result, assuming this one is the page content for _, value := range w2.Query.Pages { return value.Rev[0].RawContent } // otherwise return an empty string return "" }
/** * will search wikipedia for a search term and return existing matching pages * will be outdated soon! * * NOT IN USE CURRENTLY */ func OpenSearchWikipedia(searchTerm string) []string { // lets create a new http request object rb := new(scraper.RequestBit) rb.Url = "http://en.wikipedia.org/w/api.php?action=opensearch&search=" + url.QueryEscape(searchTerm) + "&format=json&limit=3" glog.Infof("Url crawling in SearchWikipedia: %+v", rb.Url) rb.Work() // fire the request // as wikipedia returns a sh*t formatted json we need to assign the result in two steps wikiOpenSearch := new(requestStruct.WikiOpenSearch) // first step is to assign the result term which is the first item in the returned array if err := json.Unmarshal(rb.ResponseArrayRawJson[0], &wikiOpenSearch.SearchTerm); err != nil { glog.Fatalf("expect string: %+v", err) } // second step is to assign the second item into the 'Results' array if err := json.Unmarshal(rb.ResponseArrayRawJson[1], &wikiOpenSearch.Results); err != nil { glog.Fatalf("expect []string: %+v", err) } return wikiOpenSearch.Results }
/** * will search wikipedia for a search term and return existing matching pages */ func SearchWikipedia(searchTerm string) []string { // lets create a new http request object rb := new(scraper.RequestBit) rb.Url = "https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=" + url.QueryEscape(searchTerm) + "&format=json" glog.Infof("Url crawling in SearchWikipedia: %+v", rb.Url) // inject the struct for the json response rb.ResponseObjectInterface = new(requestStruct.WikiSearch) rb.Work() // fire the request w2 := *rb.ResponseObjectInterface.(*requestStruct.WikiSearch) var results []string for _, value := range w2.Query.Search { // results[len(results)] = value.Rev[0].RawContent // http://blog.golang.org/slices results = append(results, value.Title) } return results }