/**
 * TestScraper runs scraper.RequestBit against two Wikipedia API URLs: a
 * search for "database" and a page query for "Yanqing_County". It then checks
 * the stored URL, the plain response and the decoded response object.
 *
 * NOTE(review): both scenarios call rb.Work() on hard-coded Wikipedia URLs and
 * assert on literal response content, so they presumably need network access
 * and can break when the remote content changes — confirm.
 */
func TestScraper(t *testing.T) {
	Convey("Testing the scraper to search within wikipedia", t, func() {
		rb := new(scraper.RequestBit)
		rb.Url = "https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=" + url.QueryEscape("database") + "&format=json"

		// Hand the request the struct the response should be decoded into,
		// run it, then read the result back through the interface field.
		rb.ResponseObjectInterface = new(requestStruct.WikiSearch)
		rb.Work()
		response := *rb.ResponseObjectInterface.(*requestStruct.WikiSearch)

		Convey("should have the correct url", func() {
			So(rb.Url, ShouldEqual, "https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=database&format=json")
		})

		Convey("should store the plain response string", func() {
			So(rb.PlainResponse, ShouldContainSubstring, `"title":"Database","snippet":"A <span class='searchmatch'>database</span> is an organized collection of data .`)
			So(rb.PlainResponse, ShouldContainSubstring, `{"query-continue":{"search":{"sroffset":10}},"query":{"searchinfo":`)
		})

		Convey("should store the given struct as a the response object", func() {
			log.Printf("Length word list: %+v", rb.ResponseObjectRawJson)
			So(reflect.TypeOf(rb.ResponseObjectRawJson).String(), ShouldEqual, "json.RawMessage")
			So(reflect.TypeOf(rb.ResponseArrayRawJson).String(), ShouldEqual, "[]json.RawMessage")
		})

		Convey("should Unmarschal the actual response into the response object and the first result should be ", func() {
			// Assert the slice is non-empty BEFORE indexing it, so an empty
			// result is reported as a failed assertion instead of an
			// index-out-of-range panic.
			So(len(response.Query.Search), ShouldBeGreaterThan, 0)
			So(response.Query.Search[0].Title, ShouldEqual, "Database")
		})
	})

	Convey("Testing the scraper to get a page from wikipedia", t, func() {
		rb := new(scraper.RequestBit)
		rb.Url = "http://en.wikipedia.org/w/api.php?rvprop=content&format=json&prop=revisions|categories&rvprop=content&action=query&titles=Yanqing_County"
		rb.ResponseObjectInterface = new(requestStruct.WikiPage)
		rb.Work()
		response := *rb.ResponseObjectInterface.(*requestStruct.WikiPage)

		Convey("should have the correct url", func() {
			So(rb.Url, ShouldEqual, "http://en.wikipedia.org/w/api.php?rvprop=content&format=json&prop=revisions|categories&rvprop=content&action=query&titles=Yanqing_County")
		})

		Convey("should store the plain response string", func() {
			// PlainResponse is a string (it is used with
			// ShouldContainSubstring above), so comparing it to nil could
			// never fail; compare against the empty string instead.
			So(rb.PlainResponse, ShouldNotEqual, "")
		})

		Convey("should store the given struct as a the response object", func() {
			So(reflect.TypeOf(response).String(), ShouldEqual, "requestStruct.WikiPage")
		})

		Convey("should Unmarschal the actual response into the response object", func() {
			So(response.Query.Pages["2256752"].Title, ShouldEqual, "Yanqing County")
			So(response.Query.Pages["2256752"].PageId, ShouldEqual, 2256752)
		})

		Convey("should Unmarschal the actual response into the response object and contain the wanted data", func() {
			// Guard the revision list before reading its first entry.
			So(len(response.Query.Pages["2256752"].Rev), ShouldBeGreaterThan, 0)
			So(response.Query.Pages["2256752"].Rev[0].RawContent, ShouldContainSubstring, `name = {{raise|0.2em|Yanqing County}}`)
		})
	})
}
Ejemplo n.º 2
0
/**
 * GetWikipediaPage requests a page from the English Wikipedia query API and
 * returns the raw content of its first revision.
 *
 * firstPage is the page title; it is query-escaped before being appended to
 * the request URL.
 *
 * Returns the RawContent of the first revision of a returned page, or an
 * empty string when the response object is not a *requestStruct.WikiPage or
 * when no returned page carries a revision.
 */
func GetWikipediaPage(firstPage string) string {
	// lets create a new http request object
	rb := new(scraper.RequestBit)
	rb.Url = "http://en.wikipedia.org/w/api.php?rvprop=content&format=json&prop=revisions|categories&rvprop=content&action=query&titles=" + url.QueryEscape(firstPage)
	glog.Infof("Url crawling in GetWikipediaPage: %+v", rb.Url)

	// inject the struct for the json response
	rb.ResponseObjectInterface = new(requestStruct.WikiPage)
	rb.Work() // fire the request

	// The response object is only reachable through an interface, so it has to
	// be asserted back to its concrete type. The comma-ok form avoids a panic
	// should the stored value not be a usable *WikiPage.
	// @TODO it would be nice to work without an interface here at all, but on the other hand to be flexible on the struct
	page, ok := rb.ResponseObjectInterface.(*requestStruct.WikiPage)
	if !ok || page == nil {
		return ""
	}

	// 'Pages' is a map, so iterate through it and return the content of the
	// first entry that has a revision, assuming this one is the page content.
	// Entries without revisions are skipped instead of indexed, because
	// Rev[0] on an empty list would panic (presumably what the API returns
	// for a title that does not exist — verify).
	for _, value := range page.Query.Pages {
		if len(value.Rev) > 0 {
			return value.Rev[0].RawContent
		}
	}

	// otherwise return an empty string
	return ""
}
Ejemplo n.º 3
0
/**
 * OpenSearchWikipedia queries the English Wikipedia opensearch API for a
 * search term (the request asks for at most 3 entries) and returns existing
 * matching pages, taken from the second item of the response array.
 * will be outdated soon!
 *
 * NOT IN USE CURRENTLY
 *
 * searchTerm is query-escaped before being appended to the request URL.
 *
 * Returns nil, after logging the reason, when the response array has fewer
 * than two items or one of the two items cannot be decoded.
 */
func OpenSearchWikipedia(searchTerm string) []string {
	// lets create a new http request object
	rb := new(scraper.RequestBit)

	rb.Url = "http://en.wikipedia.org/w/api.php?action=opensearch&search=" + url.QueryEscape(searchTerm) + "&format=json&limit=3"
	glog.Infof("Url crawling in OpenSearchWikipedia: %+v", rb.Url)
	rb.Work() // fire the request

	// Both items read below must exist; indexing a shorter array would panic.
	if len(rb.ResponseArrayRawJson) < 2 {
		glog.Errorf("expect at least 2 items in response array, got %d", len(rb.ResponseArrayRawJson))
		return nil
	}

	// the response is a JSON array of differently typed items, so the result
	// has to be assigned in two steps
	wikiOpenSearch := new(requestStruct.WikiOpenSearch)

	// first step is to assign the result term which is the first item in the returned array
	if err := json.Unmarshal(rb.ResponseArrayRawJson[0], &wikiOpenSearch.SearchTerm); err != nil {
		// A malformed remote response is logged and reported as "no results"
		// rather than terminating the whole process.
		glog.Errorf("expect string: %+v", err)
		return nil
	}

	// second step is to assign the second item into the 'Results' array
	if err := json.Unmarshal(rb.ResponseArrayRawJson[1], &wikiOpenSearch.Results); err != nil {
		glog.Errorf("expect []string: %+v", err)
		return nil
	}

	return wikiOpenSearch.Results
}
Ejemplo n.º 4
0
/**
 * SearchWikipedia runs a search on the English Wikipedia query API
 * (list=search) for the given term and returns the titles of the entries
 * found in the decoded response, in response order.
 */
func SearchWikipedia(searchTerm string) []string {
	// Set up the request object and point it at the search endpoint; the
	// term is query-escaped before it goes into the URL.
	request := new(scraper.RequestBit)
	request.Url = "https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=" + url.QueryEscape(searchTerm) + "&format=json"
	glog.Infof("Url crawling in SearchWikipedia: %+v", request.Url)

	// Attach the struct for the json response, then fire the request.
	request.ResponseObjectInterface = new(requestStruct.WikiSearch)
	request.Work()

	// Read the response object back through its concrete type.
	found := request.ResponseObjectInterface.(*requestStruct.WikiSearch)

	// Collect the title of every search entry; the slice stays nil when
	// there are none.
	var titles []string
	for i := range found.Query.Search {
		titles = append(titles, found.Query.Search[i].Title)
	}

	return titles
}