|
import * as cheerio from 'cheerio'; |
|
import { httpRequest } from '../utils/utils'; |
|
|
|
const EndPoints = { |
|
WEB: 'https://www.sogou.com/web', |
|
LINK: 'https://www.sogou.com/' |
|
}; |
|
|
|
const SELECTOR = { |
|
results: '#main .results', |
|
resultTitle: '.vr-title', |
|
resultLink: '.vr-title > a', |
|
resultSnippet: ['.star-wiki', '.fz-mid', '.attribute-centent'], |
|
resultSnippetExcluded: ['.text-lightgray', '.zan-box', '.tag-website'], |
|
related: '#main .vrwrap.middle-better-hintBox .hint-mid' |
|
}; |
|
|
|
export class Sogou { |
|
private query: string; |
|
private $: cheerio.CheerioAPI; |
|
|
|
constructor(query: string) { |
|
this.query = query; |
|
} |
|
|
|
async init() { |
|
const res = await this.search(this.query); |
|
this.$ = cheerio.load(res); |
|
} |
|
|
|
private async search(query: string) { |
|
try { |
|
const res = await httpRequest({ |
|
endpoint: EndPoints.WEB, |
|
query: { |
|
query, |
|
} |
|
}); |
|
const result = await res.text(); |
|
return result; |
|
} catch(err) { |
|
return ''; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
public getRelatedQueries() { |
|
const $related = this.$(SELECTOR.related); |
|
const texts = $related.children().map((i, el) => { |
|
return this.$(el).text(); |
|
}); |
|
return texts.toArray(); |
|
} |
|
|
|
public async getResults() { |
|
const $result = this.$(SELECTOR.results); |
|
const nodes = $result.children().map((i, el) => { |
|
return this.resultFilter(el); |
|
}); |
|
const list = await Promise.all(nodes); |
|
return list.filter(item => item.url && item.snippet && item.name); |
|
} |
|
|
|
private async resultFilter(el: cheerio.Element) { |
|
const $el = this.$(el); |
|
const title = $el.find(SELECTOR.resultTitle).text().replace(/\n/g, '').trim(); |
|
const link = $el.find(SELECTOR.resultLink).get(0)?.attribs.href; |
|
const snippets = SELECTOR.resultSnippet.map(item => { |
|
SELECTOR.resultSnippetExcluded.forEach(e => { |
|
$el.find(e).remove(); |
|
}); |
|
return $el.find(item).text().trim() ?? ''; |
|
}); |
|
let url = link; |
|
if (!link?.includes('http')) url = await this.getRealTargetUrl(link); |
|
return { |
|
name: title, |
|
url, |
|
snippet: snippets.join(''), |
|
}; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
private async getRealTargetUrl(url?: string) { |
|
if (!url) return ''; |
|
const res = await httpRequest({ |
|
endpoint: EndPoints.LINK + url, |
|
headers: { |
|
Accept: '*/*' |
|
} |
|
}); |
|
const txt = await res.text(); |
|
const $ = cheerio.load(txt); |
|
|
|
const link = $('script').text(); |
|
const matches = link.match(/"(.*?)"/); |
|
return matches?.[1] || ''; |
|
} |
|
} |
|
|