gtsteffaniak · gtsteffaniak · Nov 20, 2025 · Nov 20, 2025 · Nov 20, 2025
diff --git a/.github/workflows/regular-tests.yaml b/.github/workflows/regular-tests.yaml
@@ -20,16 +20,16 @@ jobs:
       - uses: actions/checkout@v4
       - uses: actions/setup-go@v5
         with:
-          go-version: 1.22.0
-      - name: golangci-lint
-        uses: golangci/golangci-lint-action@v6
+          go-version: 'stable'
+      - uses: golangci/golangci-lint-action@v7
         with:
-          version: v1.58
+          version: 'v2.1.6'
+          install-mode: goinstall
   format:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-go@v5
         with:
-          go-version: 1.22.0
+          go-version: 1.25.4
       - run: go fmt ./...
diff --git a/crawler/collect.go b/crawler/collect.go
@@ -28,7 +28,9 @@ var collectionTypes = map[string]string{
 // Crawl is the public method that initializes the recursive crawling.
 func (c *Crawler) Collect(pageURL ...string) ([]string, error) {
 	c.mode = "collect"
-	c.compileCollections()
+	if err := c.compileCollections(); err != nil {
+		return nil, fmt.Errorf("failed to compile collection patterns: %w", err)
+	}
 	c.wg = sync.WaitGroup{}
 	c.errors = []error{} // Initialize errors slice
 	for _, url := range c.Selectors.ExcludedUrls {
@@ -56,18 +58,19 @@ func (c *Crawler) Collect(pageURL ...string) ([]string, error) {
 	return slices.Compact(c.collectedItems), nil
 }
 
-func (c *Crawler) compileCollections() {
+func (c *Crawler) compileCollections() error {
 	for _, collectionType := range c.Selectors.Collections {
 		pattern, exists := collectionTypes[collectionType]
 		if !exists {
 			pattern = fmt.Sprintf(`([https?:]|\/)[^\s()'"]+\.(?:%v)`, collectionType)
 		}
 		regex, err := regexp.Compile(pattern)
 		if err != nil {
-			fmt.Println("Error compiling regex pattern")
+			return fmt.Errorf("error compiling regex pattern for collection type '%s': %w", collectionType, err)
 		}
 		c.regexPatterns = append(c.regexPatterns, *regex)
 	}
+	return nil
 }
 
 // recursiveCrawl is a private method that performs the recursive crawling, respecting MaxDepth.
@@ -89,9 +92,13 @@ func (c *Crawler) recursiveCollect(pageURL string, currentDepth int) error {
 	c.pagesContent[pageURL] = ""
 	c.mutex.Unlock()
 	htmlContent, err := c.FetchHTML(pageURL, useJavascript)
-
 	if err != nil {
-		return fmt.Errorf("error fetching HTML for %s: %w", pageURL, err)
+		// Log transient HTTP errors but don't fail the entire crawl
+		// These are expected when scraping (403, 404, network issues, etc.)
+		if !c.Silent {
+			fmt.Printf("Warning: failed to fetch %s: %v\n", pageURL, err)
+		}
+		return nil // Continue crawling other pages
 	}
 	if currentDepth > 0 && len(c.Selectors.ContentPatterns) > 0 {
 		matchContentPattern := false
@@ -109,11 +116,19 @@ func (c *Crawler) recursiveCollect(pageURL string, currentDepth int) error {
 	c.mutex.Unlock()
 	links, err := c.extractLinks(htmlContent)
 	if err != nil {
-		return err
+		// HTML parsing errors are common with malformed HTML - log but continue
+		if !c.Silent {
+			fmt.Printf("Warning: failed to extract links from %s: %v\n", pageURL, err)
+		}
+		return nil // Continue with other pages
 	}
 	items, err := c.extractItems(htmlContent, pageURL)
 	if err != nil {
-		return err
+		// HTML parsing errors are common - log but continue
+		if !c.Silent {
+			fmt.Printf("Warning: failed to extract items from %s: %v\n", pageURL, err)
+		}
+		return nil // Continue with other pages
 	}
 	c.mutex.Lock()
 	c.collectedItems = append(c.collectedItems, items...)

diff --git a/crawler/collect_test.go b/crawler/collect_test.go
@@ -1,7 +1,6 @@
 package crawler
 
 import (
-	"fmt"
 	"reflect"
 	"testing"
 
@@ -100,7 +99,8 @@ func TestExtractItems(t *testing.T) {
 			c := NewCrawler()
 			c.Selectors = *tt.s
 			c.mode = "collect"
-			c.compileCollections()
+			err := c.compileCollections()
+			assert.NoError(t, err)
 			for key, html := range tt.html {
 				assert.Contains(t, tt.want, key)
 				got, _ := c.extractItems(html, "https://www.domain.com")
@@ -127,7 +127,7 @@ func Benchmark_collectionSearch(b *testing.B) {
 		Ids:         []string{},
 	}
 	c.mode = "collect"
-	c.compileCollections()
+	_ = c.compileCollections() // Ignore error in benchmark
 	for i := 0; i < b.N; i++ {
 		_, _ = c.extractItems(testHtml, "https://www.domain.com")
 	}
@@ -140,10 +140,6 @@ func TestSingleSourceRunCollectHtml(t *testing.T) {
 	c.MaxDepth = 1
 	c.MaxLinks = 3
 	results, err := c.Collect("https://www.cnn.com/")
-	fmt.Println(err)
-	for _, result := range results {
-		fmt.Println(result)
-	}
 	assert.Equal(t, nil, err)
 	// With MaxLinks=3 and MaxDepth=1, we should get at least the starting URL plus some links
 	assert.GreaterOrEqual(t, len(results), 1, "Should collect at least the starting page URL")

diff --git a/crawler/crawl.go b/crawler/crawl.go
@@ -11,6 +11,7 @@ import (
 func (c *Crawler) Crawl(pageURL ...string) (map[string]string, error) {
 	c.mode = "crawl"
 	c.wg = sync.WaitGroup{}
+	c.errors = []error{} // Initialize errors slice
 	for _, url := range c.Selectors.ExcludedUrls {
 		c.pagesContent[url] = ""
 	}
@@ -20,7 +21,12 @@ func (c *Crawler) Crawl(pageURL ...string) (map[string]string, error) {
 			defer c.wg.Done()
 			err := c.recursiveCrawl(url, 1)
 			if err != nil {
-				fmt.Printf("Error crawling %s: %v\n", url, err)
+				c.mutex.Lock()
+				c.errors = append(c.errors, err)
+				c.mutex.Unlock()
+				if !c.Silent {
+					fmt.Printf("Error crawling %s: %v\n", url, err)
+				}
 			}
 		}(url)
 	}
@@ -32,6 +38,10 @@ func (c *Crawler) Crawl(pageURL ...string) (map[string]string, error) {
 		}
 	}
 
+	// Return the first error if any occurred (but still return the results)
+	if len(c.errors) > 0 {
+		return c.pagesContent, c.errors[0]
+	}
 	return c.pagesContent, nil
 }
 
@@ -58,8 +68,12 @@ func (c *Crawler) recursiveCrawl(pageURL string, currentDepth int) error {
 
 	htmlContent, err := c.FetchHTML(pageURL, useJavascript)
 	if err != nil {
-		fmt.Println(err)
-		return nil // return nil on page load error because the site could be down
+		// Log transient HTTP errors but don't fail the entire crawl
+		// These are expected when scraping (403, 404, network issues, etc.)
+		if !c.Silent {
+			fmt.Printf("Warning: failed to fetch %s: %v\n", pageURL, err)
+		}
+		return nil // Continue crawling other pages
 	}
 
 	if currentDepth > 0 && len(c.Selectors.ContentPatterns) > 0 {
@@ -84,7 +98,11 @@ func (c *Crawler) recursiveCrawl(pageURL string, currentDepth int) error {
 
 	links, err := c.extractLinks(htmlContent)
 	if err != nil {
-		return err
+		// HTML parsing errors are common with malformed HTML - log but continue
+		if !c.Silent {
+			fmt.Printf("Warning: failed to extract links from %s: %v\n", pageURL, err)
+		}
+		return nil // Continue with other pages
 	}
 
 	// Limit the number of concurrent goroutines based on Threads
@@ -111,7 +129,12 @@ func (c *Crawler) recursiveCrawl(pageURL string, currentDepth int) error {
 				}()
 				err := c.recursiveCrawl(url, currentDepth+1)
 				if err != nil {
-					fmt.Printf("Error crawling %s: %v\n", url, err)
+					c.mutex.Lock()
+					c.errors = append(c.errors, err)
+					c.mutex.Unlock()
+					if !c.Silent {
+						fmt.Printf("Error crawling %s: %v\n", url, err)
+					}
 				}
 			}(fullURL)
 		}

diff --git a/crawler/html.go b/crawler/html.go
@@ -26,22 +26,27 @@ func (c *Crawler) FetchHTML(pageURL string, javascriptEnabled bool) (string, err
 	if javascriptEnabled {
 		html, err := browser.GetHtmlContent(pageURL)
 		if err != nil {
-			fmt.Println(err)
+			// Browser errors are returned to caller for handling
+			// Caller will decide if it's transient or critical
+			return html, err
 		}
-		return html, err
-	} else {
-		return c.requestPage(pageURL)
+		return html, nil
 	}
+	return c.requestPage(pageURL)
 }
 
 func (c *Crawler) requestPage(pageURL string) (string, error) {
 	resp, err := http.Get(pageURL)
 	if err != nil {
-		return "", err
+		// Network errors are transient - return for caller to handle
+		return "", fmt.Errorf("network error fetching %s: %w", pageURL, err)
 	}
-	defer resp.Body.Close()
+	defer func() {
+		_ = resp.Body.Close() // Ignore close errors on HTTP response body
+	}()
 	if resp.StatusCode != http.StatusOK {
-		return "", fmt.Errorf("status code error: %d %s", resp.StatusCode, resp.Status)
+		// HTTP errors (403, 404, 500, etc.) are transient in scraping context
+		return "", fmt.Errorf("HTTP %d %s for %s", resp.StatusCode, resp.Status, pageURL)
 	}
 	bodyBytes, err := io.ReadAll(resp.Body)
 	if err != nil {
@@ -180,7 +185,11 @@ func (c *Crawler) performSearch(n *html.Node, pageUrl string) []string {
 	items := []string{}
 	htmlString, err := nodeToString(n)
 	if err != nil {
-		fmt.Println("error converting node to string", err)
+		// Node rendering errors are edge cases - log but continue
+		if !c.Silent {
+			fmt.Printf("Warning: error converting node to string: %v\n", err)
+		}
+		return items // Return empty slice, continue processing other nodes
 	}
 	for _, re := range c.regexPatterns {
 		foundItems := re.FindAllString(htmlString, -1)

diff --git a/crawler/utils.go b/crawler/utils.go
@@ -6,17 +6,20 @@ import (
 )
 
 // toAbsoluteURL converts a relative URL to an absolute URL based on the base URL.
+// URL parsing errors are not logged or returned; the function silently falls back to a best-effort result.
 func toAbsoluteURL(base, link string) string {
 	// Handle protocol-relative URLs (starting with //)
 	if strings.HasPrefix(link, "//") {
 		baseURL, err := url.Parse(base)
 		if err != nil {
+			// Invalid base URL - return link as-is (best effort)
 			return link
 		}
 		return baseURL.Scheme + ":" + link
 	}
 	u, err := url.Parse(link)
 	if err != nil {
+		// Invalid link URL - return as-is (best effort)
 		return link
 	}
 	if u.IsAbs() {
@@ -25,22 +28,26 @@ func toAbsoluteURL(base, link string) string {
 	if strings.HasPrefix(link, "/") {
 		baseURL, err := url.Parse(base)
 		if err != nil {
+			// Invalid base URL - try to construct from domain
 			return "https://" + getDomain(base) + link
 		}
 		return baseURL.Scheme + "://" + baseURL.Host + link
 	}
 	baseURL, err := url.Parse(base)
 	if err != nil {
+		// Invalid base URL - return base as fallback
 		return base
 	}
 	resolved := baseURL.ResolveReference(u)
 	return resolved.String()
 }
 
 // getDomain returns the domain of a URL.
+// Returns empty string if URL parsing fails (invalid URL).
 func getDomain(pageURL string) string {
 	u, err := url.Parse(pageURL)
 	if err != nil {
+		// Invalid URL - return empty string (caller should handle)
 		return ""
 	}
 	return u.Host
@@ -97,7 +104,7 @@ func (c *Crawler) linkTextCheck(link, linkText string) bool {
 
 func (c *Crawler) validDomainCheck(fullURL string) bool {
 	// Handle protocol-relative URLs by checking if it starts with // or has a scheme
-	if !(strings.HasPrefix(fullURL, "https://") || strings.HasPrefix(fullURL, "http://") || strings.HasPrefix(fullURL, "//")) {
+	if !strings.HasPrefix(fullURL, "https://") && !strings.HasPrefix(fullURL, "http://") && !strings.HasPrefix(fullURL, "//") {
 		return false
 	}
 	// Convert protocol-relative URLs to absolute for domain checking