Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/regular-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,16 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version: 1.22.0
- name: golangci-lint
uses: golangci/golangci-lint-action@v6
go-version: 'stable'
- uses: golangci/golangci-lint-action@v7
with:
version: v1.58
version: 'v2.1.6'
install-mode: goinstall
format:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version: 1.22.0
go-version: 1.25.4
- run: go fmt ./...
29 changes: 22 additions & 7 deletions crawler/collect.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ var collectionTypes = map[string]string{
// Collect is the public method that starts the recursive crawl in "collect" mode and returns the collected items.
func (c *Crawler) Collect(pageURL ...string) ([]string, error) {
c.mode = "collect"
c.compileCollections()
if err := c.compileCollections(); err != nil {
return nil, fmt.Errorf("failed to compile collection patterns: %w", err)
}
c.wg = sync.WaitGroup{}
c.errors = []error{} // Initialize errors slice
for _, url := range c.Selectors.ExcludedUrls {
Expand Down Expand Up @@ -56,18 +58,19 @@ func (c *Crawler) Collect(pageURL ...string) ([]string, error) {
return slices.Compact(c.collectedItems), nil
}

func (c *Crawler) compileCollections() {
func (c *Crawler) compileCollections() error {
for _, collectionType := range c.Selectors.Collections {
pattern, exists := collectionTypes[collectionType]
if !exists {
pattern = fmt.Sprintf(`([https?:]|\/)[^\s()'"]+\.(?:%v)`, collectionType)
}
regex, err := regexp.Compile(pattern)
if err != nil {
fmt.Println("Error compiling regex pattern")
return fmt.Errorf("error compiling regex pattern for collection type '%s': %w", collectionType, err)
}
c.regexPatterns = append(c.regexPatterns, *regex)
}
return nil
}

// recursiveCollect is a private method that performs the recursive collection crawl, respecting MaxDepth.
Expand All @@ -89,9 +92,13 @@ func (c *Crawler) recursiveCollect(pageURL string, currentDepth int) error {
c.pagesContent[pageURL] = ""
c.mutex.Unlock()
htmlContent, err := c.FetchHTML(pageURL, useJavascript)

if err != nil {
return fmt.Errorf("error fetching HTML for %s: %w", pageURL, err)
// Log transient HTTP errors but don't fail the entire crawl
// These are expected when scraping (403, 404, network issues, etc.)
if !c.Silent {
fmt.Printf("Warning: failed to fetch %s: %v\n", pageURL, err)
}
return nil // Continue crawling other pages
}
if currentDepth > 0 && len(c.Selectors.ContentPatterns) > 0 {
matchContentPattern := false
Expand All @@ -109,11 +116,19 @@ func (c *Crawler) recursiveCollect(pageURL string, currentDepth int) error {
c.mutex.Unlock()
links, err := c.extractLinks(htmlContent)
if err != nil {
return err
// HTML parsing errors are common with malformed HTML - log but continue
if !c.Silent {
fmt.Printf("Warning: failed to extract links from %s: %v\n", pageURL, err)
}
return nil // Continue with other pages
}
items, err := c.extractItems(htmlContent, pageURL)
if err != nil {
return err
// HTML parsing errors are common - log but continue
if !c.Silent {
fmt.Printf("Warning: failed to extract items from %s: %v\n", pageURL, err)
}
return nil // Continue with other pages
}
c.mutex.Lock()
c.collectedItems = append(c.collectedItems, items...)
Expand Down
10 changes: 3 additions & 7 deletions crawler/collect_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package crawler

import (
"fmt"
"reflect"
"testing"

Expand Down Expand Up @@ -100,7 +99,8 @@ func TestExtractItems(t *testing.T) {
c := NewCrawler()
c.Selectors = *tt.s
c.mode = "collect"
c.compileCollections()
err := c.compileCollections()
assert.NoError(t, err)
for key, html := range tt.html {
assert.Contains(t, tt.want, key)
got, _ := c.extractItems(html, "https://www.domain.com")
Expand All @@ -127,7 +127,7 @@ func Benchmark_collectionSearch(b *testing.B) {
Ids: []string{},
}
c.mode = "collect"
c.compileCollections()
_ = c.compileCollections() // Ignore error in benchmark
for i := 0; i < b.N; i++ {
_, _ = c.extractItems(testHtml, "https://www.domain.com")
}
Expand All @@ -140,10 +140,6 @@ func TestSingleSourceRunCollectHtml(t *testing.T) {
c.MaxDepth = 1
c.MaxLinks = 3
results, err := c.Collect("https://www.cnn.com/")
fmt.Println(err)
for _, result := range results {
fmt.Println(result)
}
assert.Equal(t, nil, err)
// With MaxLinks=3 and MaxDepth=1, we should get at least the starting URL plus some links
assert.GreaterOrEqual(t, len(results), 1, "Should collect at least the starting page URL")
Expand Down
33 changes: 28 additions & 5 deletions crawler/crawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
func (c *Crawler) Crawl(pageURL ...string) (map[string]string, error) {
c.mode = "crawl"
c.wg = sync.WaitGroup{}
c.errors = []error{} // Initialize errors slice
for _, url := range c.Selectors.ExcludedUrls {
c.pagesContent[url] = ""
}
Expand All @@ -20,7 +21,12 @@ func (c *Crawler) Crawl(pageURL ...string) (map[string]string, error) {
defer c.wg.Done()
err := c.recursiveCrawl(url, 1)
if err != nil {
fmt.Printf("Error crawling %s: %v\n", url, err)
c.mutex.Lock()
c.errors = append(c.errors, err)
c.mutex.Unlock()
if !c.Silent {
fmt.Printf("Error crawling %s: %v\n", url, err)
}
}
}(url)
}
Expand All @@ -32,6 +38,10 @@ func (c *Crawler) Crawl(pageURL ...string) (map[string]string, error) {
}
}

// Return the first error if any occurred (but still return the results)
if len(c.errors) > 0 {
return c.pagesContent, c.errors[0]
}
return c.pagesContent, nil
}

Expand All @@ -58,8 +68,12 @@ func (c *Crawler) recursiveCrawl(pageURL string, currentDepth int) error {

htmlContent, err := c.FetchHTML(pageURL, useJavascript)
if err != nil {
fmt.Println(err)
return nil // return nil on page load error because the site could be down
// Log transient HTTP errors but don't fail the entire crawl
// These are expected when scraping (403, 404, network issues, etc.)
if !c.Silent {
fmt.Printf("Warning: failed to fetch %s: %v\n", pageURL, err)
}
return nil // Continue crawling other pages
}

if currentDepth > 0 && len(c.Selectors.ContentPatterns) > 0 {
Expand All @@ -84,7 +98,11 @@ func (c *Crawler) recursiveCrawl(pageURL string, currentDepth int) error {

links, err := c.extractLinks(htmlContent)
if err != nil {
return err
// HTML parsing errors are common with malformed HTML - log but continue
if !c.Silent {
fmt.Printf("Warning: failed to extract links from %s: %v\n", pageURL, err)
}
return nil // Continue with other pages
}

// Limit the number of concurrent goroutines based on Threads
Expand All @@ -111,7 +129,12 @@ func (c *Crawler) recursiveCrawl(pageURL string, currentDepth int) error {
}()
err := c.recursiveCrawl(url, currentDepth+1)
if err != nil {
fmt.Printf("Error crawling %s: %v\n", url, err)
c.mutex.Lock()
c.errors = append(c.errors, err)
c.mutex.Unlock()
if !c.Silent {
fmt.Printf("Error crawling %s: %v\n", url, err)
}
}
}(fullURL)
}
Expand Down
25 changes: 17 additions & 8 deletions crawler/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,27 @@ func (c *Crawler) FetchHTML(pageURL string, javascriptEnabled bool) (string, err
if javascriptEnabled {
html, err := browser.GetHtmlContent(pageURL)
if err != nil {
fmt.Println(err)
// Browser errors are returned to caller for handling
// Caller will decide if it's transient or critical
return html, err
}
return html, err
} else {
return c.requestPage(pageURL)
return html, nil
}
return c.requestPage(pageURL)
}

func (c *Crawler) requestPage(pageURL string) (string, error) {
resp, err := http.Get(pageURL)
if err != nil {
return "", err
// Network errors are transient - return for caller to handle
return "", fmt.Errorf("network error fetching %s: %w", pageURL, err)
}
defer resp.Body.Close()
defer func() {
_ = resp.Body.Close() // Ignore close errors on HTTP response body
}()
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("status code error: %d %s", resp.StatusCode, resp.Status)
// HTTP errors (403, 404, 500, etc.) are transient in scraping context
return "", fmt.Errorf("HTTP %d %s for %s", resp.StatusCode, resp.Status, pageURL)
}
bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
Expand Down Expand Up @@ -180,7 +185,11 @@ func (c *Crawler) performSearch(n *html.Node, pageUrl string) []string {
items := []string{}
htmlString, err := nodeToString(n)
if err != nil {
fmt.Println("error converting node to string", err)
// Node rendering errors are edge cases - log but continue
if !c.Silent {
fmt.Printf("Warning: error converting node to string: %v\n", err)
}
return items // Return empty slice, continue processing other nodes
}
for _, re := range c.regexPatterns {
foundItems := re.FindAllString(htmlString, -1)
Expand Down
9 changes: 8 additions & 1 deletion crawler/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,20 @@ import (
)

// toAbsoluteURL converts a relative URL to an absolute URL based on the base URL.
// URL parsing errors are not logged or returned; the function silently falls back to a best-effort result.
func toAbsoluteURL(base, link string) string {
// Handle protocol-relative URLs (starting with //)
if strings.HasPrefix(link, "//") {
baseURL, err := url.Parse(base)
if err != nil {
// Invalid base URL - return link as-is (best effort)
return link
}
return baseURL.Scheme + ":" + link
}
u, err := url.Parse(link)
if err != nil {
// Invalid link URL - return as-is (best effort)
return link
}
if u.IsAbs() {
Expand All @@ -25,22 +28,26 @@ func toAbsoluteURL(base, link string) string {
if strings.HasPrefix(link, "/") {
baseURL, err := url.Parse(base)
if err != nil {
// Invalid base URL - try to construct from domain
return "https://" + getDomain(base) + link
}
return baseURL.Scheme + "://" + baseURL.Host + link
}
baseURL, err := url.Parse(base)
if err != nil {
// Invalid base URL - return base as fallback
return base
}
resolved := baseURL.ResolveReference(u)
return resolved.String()
}

// getDomain returns the domain of a URL.
// Returns empty string if URL parsing fails (invalid URL).
func getDomain(pageURL string) string {
u, err := url.Parse(pageURL)
if err != nil {
// Invalid URL - return empty string (caller should handle)
return ""
}
return u.Host
Expand Down Expand Up @@ -97,7 +104,7 @@ func (c *Crawler) linkTextCheck(link, linkText string) bool {

func (c *Crawler) validDomainCheck(fullURL string) bool {
// Reject anything that is neither an http(s) URL nor a protocol-relative URL (one starting with //)
if !(strings.HasPrefix(fullURL, "https://") || strings.HasPrefix(fullURL, "http://") || strings.HasPrefix(fullURL, "//")) {
if !strings.HasPrefix(fullURL, "https://") && !strings.HasPrefix(fullURL, "http://") && !strings.HasPrefix(fullURL, "//") {
return false
}
// Convert protocol-relative URLs to absolute for domain checking
Expand Down
Loading