WAP网站是啥？WAP是移动端还是手机端？Python百度下拉框关键词采集源码：
06 | def get_keywords(word): |
07 | url = f "https://www.baidu.com/sugrec?pre=1&ie=utf-8&json=1&prod=pc&wd={word}" |
08 | html = requests.get(url) |
13 | for key_word in html[ 'g' ]: |
15 | key_words.append(key_word[ 'q' ]) |
20 | url = 'https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su?wd=%s&sugmode=2&json=1&p=3&sid=1427_21091_21673_22581&req=2&pbs=%%E5%%BF%%AB%%E6%%89%%8B&csor=2&pwd=%%E5%%BF%%AB%%E6%%89%%8B&cb=jQuery11020924966752020363_1498055470768&_=1498055470781' % word |
21 | r = requests.get(url, verify = False ) |
23 | res = cont[ 41 : - 2 ].decode( 'gbk' ) |
24 | res_json = json.loads(res) |
28 | url = f 'http://suggestion.baidu.com/su?wd={word}&sugmode=3&json=1' |
29 | html = requests.get(url).text |
30 | html = html.replace( "window.baidu.sug(" ,'') |
31 | html = html.replace( ")" , '') |
32 | html = html.replace( ";" , '') |
34 | html = json.loads(html) |
40 | opencsv = open ( 'word.csv' , 'a+' ) |
41 | for word in open ( 'gjc.txt' ,encoding = 'utf-8' ): |
42 | print (urllib.parse.quote_plus(word)) |
43 | url = 'https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su?wd=%s&sugmode=2&json=1&p=3&sid=1427_21091_21673_22581&req=2' % urllib.parse.quote_plus(word) |
44 | html = requests.get(url).text |
45 | html = html.replace( 'window.baidu.sug(' ,'') |
46 | html = html.replace( ');' ,'') |
52 | opencsv.write( '%s\n' % i) |
def get_more_word(word):
    """Gather suggestions for ``word`` followed by each letter a-z.

    ``get_keywords`` is called once per extended query (``word + 'a'``
    through ``word + 'z'``) and the results are merged.

    Args:
        word: Seed keyword that each lowercase letter is appended to.

    Returns:
        A list of the distinct keywords collected, in first-seen order.

    Side effects:
        Prints the number of distinct keywords.
    """
    # NOTE(review): the published listing omits the line that creates the
    # accumulator; an empty list is assumed because the loop calls .extend().
    more_word = []
    for i in 'abcdefghijklmnopqrstuvwxyz':
        more_word.extend(get_keywords('%s%s' % (word, i)))

    # De-duplicate once; dict.fromkeys keeps first-seen order, unlike set(),
    # so repeated runs produce the same ordering.
    unique_words = list(dict.fromkeys(more_word))
    print(len(unique_words))
    return unique_words
def get_more_sug(word):
    """Gather suggestions for ``word`` followed by each letter a-z.

    ``get_sug`` is called once per extended query (``word + 'a'`` through
    ``word + 'z'``) and the results are merged.

    Args:
        word: Seed keyword that each lowercase letter is appended to.

    Returns:
        A list of the distinct suggestions collected, in first-seen order.

    Side effects:
        Prints the number of distinct suggestions.
    """
    # NOTE(review): the published listing omits the line that creates the
    # accumulator; an empty list is assumed because the loop uses ``+=`` and
    # the result is passed to ``set``.
    all_words = []
    for i in 'abcdefghijklmnopqrstuvwxyz':
        all_words += get_sug(word + i)

    # De-duplicate once; dict.fromkeys keeps first-seen order, unlike set(),
    # so repeated runs produce the same ordering.
    unique_words = list(dict.fromkeys(all_words))
    print(len(unique_words))
    return unique_words
|
本文提供多种Python百度下拉框关键词采集方式，均基于百度API接口实现，结果可导出到Excel表格。文中包含4个采集函数和2个汇总函数，可根据自己的需求灵活使用。