Go Web Scraping

// Perform an HTTP request to load a page and search for a string
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"strings"
	"time"
)

func main() {
	// Load command line arguments
	if len(os.Args) != 3 {
		fmt.Println("Search for a keyword in the contents of a URL")
		fmt.Println("Usage: " + os.Args[0] + " <url> <keyword>")
		fmt.Println("Example: " + os.Args[0] + " https://www.devdungeon.com NanoDano")
		os.Exit(1)
	}
	url := os.Args[1]
	needle := os.Args[2]

	// Create a custom http client to override default settings. Optional step.
	// Use http.Get() instead of client.Get() to use default client.
	client := &http.Client{
		Timeout: 30 * time.Second, // Default is forever!
		// CheckRedirect - Policy for following HTTP redirects
		// Jar - Cookie jar holding cookies
		// Transport - Change default method for making request
	}

	response, err := client.Get(url)
	if err != nil {
		log.Fatal("Error fetching URL. ", err)
	}
	defer response.Body.Close() // Always close the body when done reading

	// Read response body
	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log.Fatal("Error reading HTTP body. ", err)
	}

	// Search for string
	if strings.Contains(string(body), needle) {
		fmt.Println("Match found for " + needle + " in URL " + url)
	} else {
		fmt.Println("No match found for " + needle + " in URL " + url)
	}
}
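The commented-out Client fields above can also be set. Here is a minimal sketch (my own example, not part of the original program) of a client that carries a cookie jar and stops following redirects; http.ErrUseLastResponse tells the client to return the redirect response itself instead of following it.

// Demonstrate optional http.Client fields: Jar and CheckRedirect
package main

import (
	"log"
	"net/http"
	"net/http/cookiejar"
	"time"
)

func main() {
	// Cookie jar to automatically store and resend cookies
	jar, err := cookiejar.New(nil)
	if err != nil {
		log.Fatal("Error creating cookie jar. ", err)
	}

	client := &http.Client{
		Timeout: 30 * time.Second,
		Jar:     jar,
		// Return the first redirect response instead of following it
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			return http.ErrUseLastResponse
		},
	}

	response, err := client.Get("https://www.devdungeon.com")
	if err != nil {
		log.Fatal("Error fetching URL. ", err)
	}
	defer response.Body.Close()
	log.Println("Status:", response.Status)
}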
// Search through a URL and find mailto links with email addresses
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
)

func main() {
	// Load command line arguments
	if len(os.Args) != 2 {
		fmt.Println("Search for emails in a URL")
		fmt.Println("Usage: " + os.Args[0] + " <url>")
		fmt.Println("Example: " + os.Args[0] + " https://www.devdungeon.com")
		os.Exit(1)
	}
	url := os.Args[1]

	// Fetch the URL
	response, err := http.Get(url)
	if err != nil {
		log.Fatal("Error fetching URL. ", err)
	}
	defer response.Body.Close() // Always close the body when done reading

	// Read the response
	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log.Fatal("Error reading HTTP body. ", err)
	}

	// Look for mailto: links using a regular expression
	re := regexp.MustCompile("\"mailto:.*?[?\"]")
	matches := re.FindAllString(string(body), -1)
	if matches == nil {
		// Clean exit if no matches found
		fmt.Println("No emails found.")
		os.Exit(0)
	}

	// Print all emails found
	for _, match := range matches {
		// Strip the leading quote plus the "mailto:" prefix (8 characters)
		// and the trailing quote or question mark by slicing the substring
		cleanedMatch := match[8 : len(match)-1]
		fmt.Println(cleanedMatch)
	}
}
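The fixed slice indices above are brittle if the pattern ever changes. As an alternative sketch (not in the original), a capture group extracts the address directly; this replaces the regex and print loop above:

	// Capture the address between "mailto: and the closing quote or ?
	re := regexp.MustCompile(`"mailto:([^"?]*)[?"]`)
	for _, match := range re.FindAllStringSubmatch(string(body), -1) {
		// match[0] is the full match, match[1] is the captured group
		fmt.Println(match[1])
	}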
// Perform an HTTP HEAD request on a URL and print out headers
package main

import (
	"fmt"
	"log"
	"net/http"
	"os"
)

func main() {
	// Load URL from command line arguments
	if len(os.Args) != 2 {
		fmt.Println(os.Args[0] + " - Perform an HTTP HEAD request to a URL")
		fmt.Println("Usage: " + os.Args[0] + " <url>")
		fmt.Println("Example: " + os.Args[0] + " https://www.devdungeon.com")
		os.Exit(1)
	}
	url := os.Args[1]

	// Perform HTTP HEAD
	response, err := http.Head(url)
	if err != nil {
		log.Fatal("Error fetching URL. ", err)
	}
	defer response.Body.Close() // HEAD responses still carry an (empty) body

	// Print each header key with its first value
	// (a header may carry multiple values; only the first is shown)
	for key, value := range response.Header {
		fmt.Printf("%s: %s\n", key, value[0])
	}
}
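To read one specific header instead of looping over all of them, response.Header.Get does a case-insensitive lookup and returns the first value, or an empty string if the header is absent. For example:

	fmt.Println("Server: " + response.Header.Get("Server"))
	fmt.Println("Content-Type: " + response.Header.Get("Content-Type"))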
// Set a cookie on an HTTP request
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
)

var url = "https://www.example.com"

func main() {
	// Create the HTTP request
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Fatal("Error creating HTTP request. ", err)
	}

	// Set cookie
	request.Header.Set("Cookie", "session_id=<SESSION_TOKEN>")

	// Create the HTTP client, make the request, and print the response
	httpClient := &http.Client{}
	response, err := httpClient.Do(request)
	if err != nil {
		log.Fatal("Error making request. ", err)
	}
	defer response.Body.Close()

	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log.Fatal("Error reading HTTP body. ", err)
	}
	fmt.Printf("%s\n", data)
}
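Instead of writing the raw Cookie header string, the typed API can build it. This small sketch using http.Cookie and request.AddCookie is equivalent to the Header.Set call above:

	// Equivalent to setting the raw Cookie header by hand
	cookie := &http.Cookie{Name: "session_id", Value: "<SESSION_TOKEN>"}
	request.AddCookie(cookie)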
// Search through a URL and find HTML comments
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
)

func main() {
	// Load command line arguments
	if len(os.Args) != 2 {
		fmt.Println("Search for HTML comments in a URL")
		fmt.Println("Usage: " + os.Args[0] + " <url>")
		fmt.Println("Example: " + os.Args[0] + " https://www.devdungeon.com")
		os.Exit(1)
	}
	url := os.Args[1]

	// Fetch the URL and get response
	response, err := http.Get(url)
	if err != nil {
		log.Fatal("Error fetching URL. ", err)
	}
	defer response.Body.Close() // Always close the body when done reading
	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log.Fatal("Error reading HTTP body. ", err)
	}

	// Look for HTML comments using a regular expression
	re := regexp.MustCompile("<!--(.|\n)*?-->")
	matches := re.FindAllString(string(body), -1)
	if matches == nil {
		// Clean exit if no matches found
		fmt.Println("No HTML comments found.")
		os.Exit(0)
	}

	// Print all HTML comments found
	for _, match := range matches {
		fmt.Println(match)
	}
}
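The (.|\n)*? idiom works, but Go's regexp package also supports the s flag, which lets . match newlines and reads more directly. An equivalent pattern:

	// (?s) makes . match newlines, so multi-line comments are caught
	re := regexp.MustCompile(`(?s)<!--.*?-->`)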
// Look for unlisted files on a domain
package main

import (
	"bufio"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"os"
	"strconv"
)

// Given a base URL (protocol+hostname) and a filepath (relative URL)
// perform an HTTP HEAD and see if the path exists.
// If the path returns a 200 OK print out the path
func checkIfUrlExists(baseUrl, filePath string, doneChannel chan bool) {
	// Signal completion when done, even on an error path,
	// so the main loop does not deadlock
	defer func() { doneChannel <- true }()

	// Create URL object from raw string
	targetUrl, err := url.Parse(baseUrl)
	if err != nil {
		log.Println("Error parsing base URL. ", err)
		return
	}
	// Set the part of the URL after the host name
	targetUrl.Path = filePath

	// Perform a HEAD only, checking status without
	// downloading the entire file
	response, err := http.Head(targetUrl.String())
	if err != nil {
		log.Println("Error fetching ", targetUrl.String())
		return
	}
	response.Body.Close()

	// If the server returns 200 OK, the file exists and can be fetched
	if response.StatusCode == 200 {
		log.Println(targetUrl.String())
	}
}

func main() {
	// Load command line arguments
	if len(os.Args) != 4 {
		fmt.Println(os.Args[0] + " - Check for unlisted files on a domain")
		fmt.Println("Usage: " + os.Args[0] +
			" <wordlist_file> <url> <maxThreads>")
		fmt.Println("Example: " + os.Args[0] +
			" wordlist.txt https://www.devdungeon.com 10")
		os.Exit(1)
	}
	wordlistFilename := os.Args[1]
	baseUrl := os.Args[2]
	maxThreads, err := strconv.Atoi(os.Args[3])
	if err != nil {
		log.Fatal("Error converting maxThread value to integer. ", err)
	}

	// Track how many goroutines are active to avoid
	// flooding the web server
	activeThreads := 0
	doneChannel := make(chan bool)

	// Open word list file for reading
	wordlistFile, err := os.Open(wordlistFilename)
	if err != nil {
		log.Fatal("Error opening wordlist file. ", err)
	}
	defer wordlistFile.Close()

	// Read each line and do an HTTP HEAD
	scanner := bufio.NewScanner(wordlistFile)
	for scanner.Scan() {
		go checkIfUrlExists(baseUrl, scanner.Text(), doneChannel)
		activeThreads++

		// If the max number of goroutines are running,
		// wait for one to finish before starting the next
		if activeThreads >= maxThreads {
			<-doneChannel
			activeThreads--
		}
	}
	}

	// Wait for the remaining goroutines to finish
	for activeThreads > 0 {
		<-doneChannel
		activeThreads--
	}

	// Scanner errors must be checked manually
	if err := scanner.Err(); err != nil {
		log.Fatal("Error reading wordlist file. ", err)
	}
}
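The done-channel counter works, but a buffered channel used as a semaphore plus sync.WaitGroup is the more common Go idiom for the same throttling. A sketch of the main loop under that pattern, assuming a hypothetical variant of checkIfUrlExists that takes no channel, with sync added to the imports:

	var wg sync.WaitGroup
	semaphore := make(chan struct{}, maxThreads) // capacity limits concurrency

	for scanner.Scan() {
		filePath := scanner.Text()
		semaphore <- struct{}{} // acquire a slot; blocks when full
		wg.Add(1)
		go func() {
			defer wg.Done()
			defer func() { <-semaphore }() // release the slot
			// Hypothetical variant without the done channel
			checkIfUrlExists(baseUrl, filePath)
		}()
	}
	wg.Wait() // wait for all outstanding checks to finish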
// Change HTTP user agent
package main

import (
	"log"
	"net/http"
)

func main() {
	// Create the request for use later
	client := &http.Client{}
	request, err := http.NewRequest("GET", "https://www.devdungeon.com", nil)
	if err != nil {
		log.Fatal("Error creating request. ", err)
	}

	// Override the user agent
	request.Header.Set("User-Agent", "_Custom User Agent_")

	// Perform the request, ignore response.
	_, err = client.Do(request)
	if err != nil {
		log.Fatal("Error making request. ", err)
	}
}
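To verify the header actually went out, the request can be pointed at an echo endpoint. A sketch assuming https://httpbin.org/user-agent is reachable (it returns the received User-Agent as JSON); fmt and io/ioutil would need to be added to the imports:

	request, err = http.NewRequest("GET", "https://httpbin.org/user-agent", nil)
	if err != nil {
		log.Fatal("Error creating request. ", err)
	}
	request.Header.Set("User-Agent", "_Custom User Agent_")

	response, err := client.Do(request)
	if err != nil {
		log.Fatal("Error making request. ", err)
	}
	defer response.Body.Close()

	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log.Fatal("Error reading response. ", err)
	}
	fmt.Printf("%s\n", body) // should echo back the custom user agent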
