forked from TransparencyToolkit/IndeedScraper
/
indeedscraper.go
115 lines (93 loc) · 2.53 KB
/
indeedscraper.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
package indeedscraper
import (
"net/http"
"crypto/tls"
"io/ioutil"
"strings"
"strconv"
"encoding/json"
"github.com/moovweb/gokogiri"
)
//TODO:
// Add concurrency (in parsing and pages?)
// Add same for job listing
// Clean up result names/types
// overall is the package-level result set: a list of string-keyed records that
// GetResumes serializes to JSON and returns.
// NOTE(review): nothing in this part of the file appends to it; presumably
// parseProfile (defined elsewhere) does — confirm. It is shared mutable state
// with no locking.
var overall []map[string]string
// Download all the resumes
func GetResumes(searchterm string, location string) string {
// Generate search URL
searchterm = cleanString(searchterm)
location = cleanString(location)
url := "http://indeed.com/resumes?"
// Add search term to URL
if searchterm != "" {
url += "q="+searchterm
}
// Add location to URL
if location != "" {
if strings.Contains(url, "?q="){
url += "&"
}
url += "l="+location
}
// Get page with results
body := getPage(url)
numPages := getPageCount(body)
// Loop through all pages
if numPages != 0 {
for i := 0; i < numPages; i++ {
getResults(url + "&start="+strconv.Itoa(i*50))
}
}
out, _ := json.MarshalIndent(overall, "", " ")
return string(out)
}
// getResults downloads one page of search results from resultsurl and hands
// the link of every profile found on it to parseProfile.
//
// The page is only processed when the element with id "results" contains at
// least one schema.org Person list item. If the page cannot be parsed, or that
// structure is missing, the function returns without doing anything instead of
// panicking.
func getResults(resultsurl string) {
	body := getPage(resultsurl)

	doc, err := gokogiri.ParseHtml(body)
	if err != nil || doc == nil {
		return
	}
	// The parsed document is backed by libxml2 memory and must be released
	// explicitly. Only plain strings are passed on below, so freeing at
	// return is safe.
	defer doc.Free()

	container := doc.NodeById("results")
	if container == nil {
		return
	}

	results, err := container.Search("//li[@itemtype='http://schema.org/Person']")
	if err != nil || len(results) == 0 {
		return
	}

	// The expression starts with "//", so it matches profile links anywhere in
	// the document rather than only below the first result.
	names, err := results[0].Search("//a[@class='app_link']")
	if err != nil {
		return
	}

	// Send link of each profile on page to parser
	for _, profile := range names {
		parseProfile("http://indeed.com" + profile.Attr("href"))
	}
}
// getPageCount reports how many result pages a search has, given the raw HTML
// of its first page.
//
// It looks up the div with id "result_count", takes the second space-separated
// token of its inner HTML as the total number of results, and divides that by
// the page size of 50, rounding up. It returns 0 when the page cannot be
// parsed, the div is missing, or the token is not a positive number.
func getPageCount(firstpage []uint8) int {
	const perPage = 50

	parsed, err := gokogiri.ParseHtml(firstpage)
	if err != nil || parsed == nil {
		return 0
	}
	// Release the libxml2-backed document once the count has been extracted.
	defer parsed.Free()

	numresults, err := parsed.Search("//div[@id='result_count']")
	if err != nil || len(numresults) == 0 {
		return 0
	}

	resultnums := strings.Split(numresults[0].InnerHtml(), " ")
	if len(resultnums) < 2 {
		return 0
	}

	// Drop thousands separators ("1,234") so that large counts are not
	// rejected by Atoi and silently treated as zero results.
	num, err := strconv.Atoi(strings.Replace(resultnums[1], ",", "", -1))
	if err != nil || num <= 0 {
		return 0
	}

	// Ceiling division: a partially filled last page still counts as a page.
	return (num + perPage - 1) / perPage
}
// getPage fetches url with an HTTP GET and returns the raw response body.
//
// It returns nil when the request cannot be performed or the body cannot be
// read, so callers must be prepared for an empty page. The HTTP status code is
// not inspected; the body of an error response is returned like any other.
func getPage(url string) []uint8 {
	// NOTE(review): certificate verification is switched off, which makes
	// HTTPS fetches vulnerable to interception. Kept as in the original;
	// confirm this is still required.
	tlsConfig := &tls.Config{
		InsecureSkipVerify: true,
	}
	transport := &http.Transport{
		TLSClientConfig: tlsConfig,
	}
	// A fresh transport is built on every call, so its keep-alive connections
	// can never be reused; close them on return instead of leaking them.
	defer transport.CloseIdleConnections()
	client := http.Client{Transport: transport}

	resp, err := client.Get(url)
	if err != nil {
		// resp is nil here; touching resp.Body would panic.
		return nil
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil
	}
	return body
}
// cleanString prepares a search term for use as a URL query value: every space
// becomes "+" and every comma becomes "%2C". All other characters, including
// ones that are already percent-encoded, are passed through unchanged.
func cleanString(input_term string) string {
	// Neither replacement produces the other's search character, so a single
	// pass gives the same result as two sequential replacements.
	return strings.NewReplacer(" ", "+", ",", "%2C").Replace(input_term)
}