Skip to content

Commit 9cc69da

Browse files
committed
feat(cli): clean search, add extract/format flags; harden engines & proxy rotation; fix bugs; update docs
- Add structured `search [engine] [query]` CLI: --limit/--lang/--region/--site/--file, --format (json|text|markdown|ndjson), --extract N, --search-timeout; Envelope and route logs to stderr with a --quiet default (fixes stdout pollution)
- Unify engines behind a single engineSpec registry (CLI + serve share it)
- Unify the extract knob to bool-or-int `extract=N` (drop extract_top); CLI and HTTP share core batch extraction, raw/rendered fetch, and clamp helpers
- Engines: Ecosia CF captcha detection (raw + browser), Yandex progressive-result wait, Google PAA poll + Has() existence probes, Bing title/desc attribute fallbacks
- Proxy: rotate challenged proxies out of the tag pool for one retry (X-Proxy-Attempts); browser health-ping skip window; opt-in WaitStable
1 parent ca3143c commit 9cc69da

33 files changed

Lines changed: 1600 additions & 370 deletions

README.md

Lines changed: 218 additions & 37 deletions
Large diffs are not rendered by default.

bing/parse_html.go

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,10 @@ func parseBingDocument(doc *goquery.Document) []core.SearchResult {
4747
return
4848
}
4949

50-
title := titleTag.Text()
50+
title := firstNonEmptyAttr(titleTag, "aria-label", "title")
51+
if title == "" {
52+
title = normalizeWhitespace(titleTag.Text())
53+
}
5154
if title == "" {
5255
title = extractFirstText(item, Selectors.TitleFallbacks)
5356
}
@@ -82,19 +85,30 @@ func parseBingDocument(doc *goquery.Document) []core.SearchResult {
8285
func extractFirstText(item *goquery.Selection, selectors []string) string {
8386
for _, selector := range selectors {
8487
if tag := item.Find(selector).First(); tag.Length() > 0 {
85-
if text := strings.TrimSpace(tag.Text()); text != "" {
88+
if text := normalizeWhitespace(tag.Text()); text != "" {
8689
return text
8790
}
88-
if label, exists := tag.Attr("aria-label"); exists {
89-
if label = strings.TrimSpace(label); label != "" {
90-
return label
91-
}
91+
if label := firstNonEmptyAttr(tag, "aria-label", "title"); label != "" {
92+
return label
9293
}
9394
}
9495
}
9596
return ""
9697
}
9798

99+
func firstNonEmptyAttr(item *goquery.Selection, attrs ...string) string {
100+
for _, attr := range attrs {
101+
value, exists := item.Attr(attr)
102+
if !exists {
103+
continue
104+
}
105+
if value = normalizeWhitespace(value); value != "" {
106+
return value
107+
}
108+
}
109+
return ""
110+
}
111+
98112
// descriptionFromItem extracts a description using the same 4-step fallback
99113
// chain as the rod-based browser parser. Bing renders snippet text with heavy
100114
// source-indentation whitespace, so each candidate is whitespace-collapsed.
@@ -118,8 +132,8 @@ func descriptionFromItem(item *goquery.Selection, title string) string {
118132
return normalizeWhitespace(strings.Replace(item.Text(), title, "", 1))
119133
}
120134

121-
// normalizeWhitespace collapses runs of whitespace (including the newlines and
122-
// indentation Bing leaves in snippet markup) into single spaces.
135+
// normalizeWhitespace collapses Bing's snippet-markup whitespace into single
136+
// spaces (see core.NormalizeWhitespace).
123137
func normalizeWhitespace(s string) string {
124-
return strings.Join(strings.Fields(s), " ")
138+
return core.NormalizeWhitespace(s)
125139
}

bing/parse_html_test.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,3 +145,49 @@ func TestParseBingHTMLTitleFallback(t *testing.T) {
145145
t.Fatalf("title = %q, want fallback", results[0].Title)
146146
}
147147
}
148+
149+
func TestParseBingHTMLPrefersTitleAttribute(t *testing.T) {
150+
t.Parallel()
151+
152+
html := `
153+
<ol id="b_results">
154+
<li class="b_algo">
155+
<h2><a aria-label="Real SERP Title" href="https://example.com/result">example.com</a></h2>
156+
<div class="b_caption"><p>Snippet</p></div>
157+
</li>
158+
</ol>`
159+
160+
results, err := ParseHTML(bytes.NewReader([]byte(html)))
161+
if err != nil {
162+
t.Fatalf("ParseHTML() error = %v", err)
163+
}
164+
if len(results) != 1 {
165+
t.Fatalf("expected 1 result, got %d", len(results))
166+
}
167+
if results[0].Title != "Real SERP Title" {
168+
t.Fatalf("title = %q, want attribute title", results[0].Title)
169+
}
170+
}
171+
172+
func TestParseBingHTMLDescriptionFallsThroughEmptyPrimary(t *testing.T) {
173+
t.Parallel()
174+
175+
html := `
176+
<ol id="b_results">
177+
<li class="b_algo">
178+
<h2><a href="https://example.com/result">Result title</a></h2>
179+
<div class="b_caption"><p> </p><div>Useful snippet text</div></div>
180+
</li>
181+
</ol>`
182+
183+
results, err := ParseHTML(bytes.NewReader([]byte(html)))
184+
if err != nil {
185+
t.Fatalf("ParseHTML() error = %v", err)
186+
}
187+
if len(results) != 1 {
188+
t.Fatalf("expected 1 result, got %d", len(results))
189+
}
190+
if results[0].Description != "Useful snippet text" {
191+
t.Fatalf("description = %q, want fallback snippet", results[0].Description)
192+
}
193+
}

bing/search.go

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,11 @@ func (bing *Bing) checkCaptcha(page *rod.Page) bool {
6666
}
6767

6868
func (bing *Bing) acceptCookies(ctx context.Context, page *rod.Page) error {
69+
// Probe first so a banner-less SERP returns immediately instead of blocking
70+
// .Element for the full Timeout/10.
71+
if has, _, err := page.Has(Selectors.CookieBtn); err != nil || !has {
72+
return nil
73+
}
6974
consentBtn, err := page.Timeout(bing.Timeout / 10).Element(Selectors.CookieBtn)
7075
if err != nil {
7176
return nil
@@ -107,8 +112,10 @@ func (bing *Bing) parseResultElement(el *rod.Element, isAd bool, rank, absoluteR
107112
return core.SearchResult{}, false
108113
}
109114

110-
title, _ := titleElem.Text()
111-
title = strings.TrimSpace(title)
115+
title := core.ElementAttribute(titleElem, "aria-label", "title")
116+
if title == "" {
117+
title = core.ElementText(titleElem)
118+
}
112119
if title == "" {
113120
title = core.FirstNonEmptyText(el, Selectors.TitleFallbacks...)
114121
}
@@ -120,24 +127,18 @@ func (bing *Bing) parseResultElement(el *rod.Element, isAd bool, rank, absoluteR
120127
return core.SearchResult{}, false
121128
}
122129

123-
desc := ""
124-
if descElem, err := el.Element(Selectors.DescPrimary); err == nil {
125-
desc, _ = descElem.Text()
126-
} else if descElem, err := el.Element(Selectors.DescFallback); err == nil {
127-
desc, _ = descElem.Text()
128-
} else if descElem, err := el.Element(Selectors.DescAny); err == nil {
129-
desc, _ = descElem.Text()
130-
} else {
130+
desc := core.FirstNonEmptyText(el, Selectors.DescPrimary, Selectors.DescFallback, Selectors.DescAny)
131+
if desc == "" {
131132
fullText, _ := el.Text()
132-
desc = strings.TrimSpace(strings.Replace(fullText, title, "", 1))
133+
desc = core.NormalizeWhitespace(strings.Replace(fullText, title, "", 1))
133134
}
134135

135136
return core.SearchResult{
136137
Rank: rank,
137138
AbsoluteRank: absoluteRank,
138139
URL: url,
139140
Title: title,
140-
Description: strings.TrimSpace(desc),
141+
Description: desc,
141142
Ad: isAd,
142143
}, true
143144
}

cmd/engines.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
package cmd
2+
3+
import (
4+
"context"
5+
"io"
6+
7+
"github.com/karust/openserp/baidu"
8+
"github.com/karust/openserp/bing"
9+
"github.com/karust/openserp/core"
10+
"github.com/karust/openserp/duckduckgo"
11+
"github.com/karust/openserp/ecosia"
12+
"github.com/karust/openserp/google"
13+
"github.com/karust/openserp/yandex"
14+
)
15+
16+
// engineSpec is the single registry row for a search engine, driving CLI search,
17+
// raw dispatch, serve's browserEngineSpecs, and the alias/validation strings.
18+
// cfg points into the live config global; rawSearchFn is nil when an engine has
19+
// no browserless mode.
20+
type engineSpec struct {
21+
name string
22+
aliases []string
23+
factory func(core.Browser, core.SearchEngineOptions) core.SearchEngine
24+
rawSearchFn func(context.Context, core.Query) ([]core.SearchResult, error)
25+
parseHTMLFn func(io.Reader) ([]core.SearchResult, error)
26+
cfg *EngineConfig
27+
}
28+
29+
func (s engineSpec) opts() core.SearchEngineOptions {
30+
return s.cfg.SearchEngineOptions
31+
}
32+
33+
func engineSpecs() []engineSpec {
34+
return []engineSpec{
35+
{name: "google", factory: newEngine(google.New), rawSearchFn: google.Search, parseHTMLFn: google.ParseHTML, cfg: &config.GoogleConfig},
36+
{name: "yandex", factory: newEngine(yandex.New), rawSearchFn: yandex.Search, parseHTMLFn: yandex.ParseHTML, cfg: &config.YandexConfig},
37+
{name: "baidu", factory: newEngine(baidu.New), rawSearchFn: baidu.Search, parseHTMLFn: baidu.ParseHTML, cfg: &config.BaiduConfig},
38+
{name: "bing", factory: newEngine(bing.New), parseHTMLFn: bing.ParseHTML, cfg: &config.BingConfig},
39+
{name: "duckduckgo", aliases: []string{"duck", "ddg"}, factory: newEngine(duckduckgo.New), parseHTMLFn: duckduckgo.ParseHTML, cfg: &config.DuckDuckGoConfig},
40+
{name: "ecosia", factory: newEngine(ecosia.New), rawSearchFn: ecosia.Search, parseHTMLFn: ecosia.ParseHTML, cfg: &config.EcosiaConfig},
41+
}
42+
}
43+
44+
// newEngine adapts a concrete pkg.New (returning *Engine) to the
45+
// core.SearchEngine-typed factory the registry stores.
46+
func newEngine[T core.SearchEngine](ctor func(core.Browser, core.SearchEngineOptions) T) func(core.Browser, core.SearchEngineOptions) core.SearchEngine {
47+
return func(b core.Browser, o core.SearchEngineOptions) core.SearchEngine {
48+
return ctor(b, o)
49+
}
50+
}
51+
52+
// engineValidArgs returns every accepted engine token (canonical names +
53+
// aliases) for cobra's OnlyValidArgs validation.
54+
func engineValidArgs() []string {
55+
specs := engineSpecs()
56+
args := make([]string, 0, len(specs))
57+
for _, s := range specs {
58+
args = append(args, s.name)
59+
args = append(args, s.aliases...)
60+
}
61+
return args
62+
}
63+
64+
// resolveEngineSpec returns the spec whose canonical name or alias matches raw
65+
// (case/space already normalized by the caller), or false when unknown.
66+
func resolveEngineSpec(raw string) (engineSpec, bool) {
67+
for _, s := range engineSpecs() {
68+
if s.name == raw {
69+
return s, true
70+
}
71+
for _, alias := range s.aliases {
72+
if alias == raw {
73+
return s, true
74+
}
75+
}
76+
}
77+
return engineSpec{}, false
78+
}

cmd/root.go

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import (
1717
)
1818

1919
const (
20-
version = "0.8.3"
20+
version = "0.8.4"
2121
defaultConfigFilename = "config"
2222
envPrefix = "OPENSERP"
2323
)
@@ -51,6 +51,7 @@ type ServerConfig struct {
5151
ConfigPath string `mapstructure:"config_path"`
5252
IsDebug bool `mapstructure:"debug"`
5353
IsVerbose bool `mapstructure:"verbose"`
54+
IsQuiet bool `mapstructure:"quiet"`
5455
IsRawRequests bool `mapstructure:"raw_requests"`
5556
Insecure bool `mapstructure:"insecure"`
5657
}
@@ -115,6 +116,7 @@ var flagToConfigKey = map[string]string{
115116
"profiles-json": "app.profiles",
116117
"verbose": "server.verbose",
117118
"debug": "server.debug",
119+
"quiet": "server.quiet",
118120
"head": "app.head",
119121
"leakless": "app.leakless",
120122
"raw": "server.raw_requests",
@@ -154,12 +156,24 @@ var RootCmd = &cobra.Command{
154156
}
155157
config.App.LogFormat = logFormat
156158

157-
core.InitLogger(config.Server.IsVerbose, config.Server.IsDebug, config.App.LogFormat)
159+
// One-shot CLI commands default to quiet so stdout is payload-only.
160+
// Server mode keeps request logs unless server.quiet is set.
161+
quiet := config.Server.IsQuiet
162+
if commandDefaultsToQuiet(cmd) && !cmd.Flags().Changed("quiet") {
163+
quiet = true
164+
}
165+
config.Server.IsQuiet = quiet
166+
167+
core.InitLogger(config.Server.IsVerbose, config.Server.IsDebug, quiet, config.App.LogFormat)
158168
logrus.WithField("config", sanitizedConfigForLog(config)).Debug("Final config")
159169
return nil
160170
},
161171
}
162172

173+
func commandDefaultsToQuiet(cmd *cobra.Command) bool {
174+
return cmd != nil && cmd.Name() != serveCMD.Name()
175+
}
176+
163177
func sanitizedConfigForLog(cfg Config) map[string]interface{} {
164178
return map[string]interface{}{
165179
"server": cfg.Server,
@@ -380,6 +394,7 @@ func setConfigDefaults(v *viper.Viper) {
380394
v.SetDefault("server.port", 7070)
381395
v.SetDefault("server.debug", false)
382396
v.SetDefault("server.verbose", false)
397+
v.SetDefault("server.quiet", false)
383398
v.SetDefault("server.raw_requests", false)
384399
v.SetDefault("server.insecure", false)
385400
v.SetDefault("app.log_format", "")
@@ -436,8 +451,9 @@ func init() {
436451
RootCmd.PersistentFlags().StringVar(&config.App.ProfilesJSON, "profiles", "", "Path to browser profile catalog JSON")
437452
RootCmd.PersistentFlags().BoolVarP(&config.Server.IsVerbose, "verbose", "v", false, "Use verbose output")
438453
RootCmd.PersistentFlags().BoolVarP(&config.Server.IsDebug, "debug", "d", false, "Use debug output. Disable headless browser")
454+
RootCmd.PersistentFlags().BoolVarP(&config.Server.IsQuiet, "quiet", "q", false, "Suppress info logs on stderr (default for CLI commands)")
439455
RootCmd.PersistentFlags().BoolVarP(&config.App.IsBrowserHead, "head", "", false, "Enable browser UI")
440-
RootCmd.PersistentFlags().BoolVarP(&config.App.IsLeakless, "leakless", "l", false, "Use leakless mode to insure browser instances are closed after search")
456+
RootCmd.PersistentFlags().BoolVarP(&config.App.IsLeakless, "leakless", "l", false, "Use leakless mode to ensure browser instances are closed after search")
441457
RootCmd.PersistentFlags().BoolVarP(&config.Server.IsRawRequests, "raw", "r", false, "Disable browser usage, use HTTP requests")
442458
RootCmd.PersistentFlags().BoolVarP(&config.App.IsLeaveHead, "leave", "", false, "Leave browser and tabs opened after search is made")
443459
RootCmd.PersistentFlags().StringVarP(&config.Config2Capcha.ApiKey, "2captcha_key", "", "", "2 captcha api key")

0 commit comments

Comments
 (0)