本文共 2475 字,大约阅读时间需要 8 分钟。
import csv
import re

import requests

# Request headers sent with every fetch.
# NOTE(review): the Cookie is a captured browser session value and has
# presumably expired; confirm whether the site still requires it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
    'Cookie': 'testcookie=yes; Hm_lvt_bc3b748c21fe5cf393d26c12b2c38d99=1619717328; Hm_lpvt_bc3b748c21fe5cf393d26c12b2c38d99=1619717328; JJEVER=%7B%22fenzhan%22%3A%22noyq%22%7D; smidV2=20210430012854effd865c944ddc429b0c481dfef3f31d0035c72a77d581610',
}


class ximalaiyaSpider:
    """Scrape album titles and links from a Ximalaya channel page into a CSV."""

    # Seconds to wait for the server before giving up instead of hanging.
    _TIMEOUT = 10

    # The <ul> element that wraps the album list on the channel page.
    _LIST_RE = re.compile(r'(<ul class="_qt">.*?</ul>)', re.S)
    # One album anchor inside that list.
    _ANCHOR_RE = re.compile(
        r'<a class="album-title line-2 lg bold kF_" title=.*?</a>', re.S
    )
    # Captures (title, relative href without its leading slash) of an anchor.
    _TITLE_RE = re.compile(
        r'<a class="album-title line-2 lg bold kF_" title="(.*?)" href="/(.*?)"><span.*?>.*?</span></a>',
        re.S,
    )

    def getSource(self) -> str:
        """Download the channel page and return its HTML as text.

        Raises:
            requests.RequestException: on connection failure, timeout, or a
                non-2xx HTTP status.
        """
        url = 'https://www.ximalaya.com/channel/7/'
        resp = requests.get(url, headers=headers, timeout=self._TIMEOUT)
        # Fail loudly on 4xx/5xx rather than parsing an error page.
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        return resp.text

    def parseSource(self) -> list:
        """Extract ``[title, absolute_url]`` pairs from the channel page.

        Returns:
            A list of two-item lists, one per album anchor that matched.

        Raises:
            ValueError: if the album list container is not present in the
                page (for example, the site markup changed).
        """
        content = self.getSource()

        container = self._LIST_RE.search(content)
        if container is None:
            raise ValueError('album list <ul class="_qt"> not found in page')

        rows = []
        for anchor in self._ANCHOR_RE.findall(container.group(1)):
            parsed = self._TITLE_RE.match(anchor)
            if parsed is None:
                # Anchor does not have the expected title/href/span shape.
                continue
            # group(2) has no leading slash (the pattern consumes it), so the
            # site root below must end with '/'.
            rows.append(
                [parsed.group(1), 'https://www.ximalaya.com/' + parsed.group(2)]
            )
        return rows

    def saveData(self) -> None:
        """Write the scraped rows to ``喜马来雅.csv`` with a header row."""
        # Parse before opening the file so a failed scrape does not truncate
        # an existing CSV.
        content = self.parseSource()
        with open('喜马来雅.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            header1 = ["作品", '链接']
            writer.writerow(header1)
            writer.writerows(content)


def main():
    """Run the spider once and save its results."""
    ximalaiyaSpider().saveData()


if __name__ == '__main__':
    main()
转载地址:http://brun.baihongyu.com/