python實現自動采集VIP課程帖子中的百度網盤鏈接

請我喝茶 · 發表于 2020-5-5 13:48:08

本帖最后由請我喝茶于 2020-5-5 14:15 編輯

# -*- coding: utf-8 -*-

# Author: 桑葚ICE
# Email: 152516cc@gmail.com
# Blog: iicey.github.io
# JueJin: juejin.im/user/5c64dce8e51d45013c40742c
import re
import time

import requests
from scrapy import Selector

class Spider:
    """Collect Baidu Netdisk share links from reply-to-view threads on rigasin.com.

    Results accumulate in ``bd_url_info`` as ``{share_url: extraction_code}``.
    All HTTP calls go through the module-level ``requests`` import and send
    ``self.headers`` (which carries the browser cookie string).
    """

    MEMBER_URL = 'http://rigasin.com/member.php'
    FORUM_URL = 'http://rigasin.com/forum.php'

    # Seconds to wait for any single HTTP request before giving up, so one
    # stalled connection cannot hang a long unattended run forever.
    timeout = 30

    # Reply token used when none can be read from the thread page.
    # NOTE(review): this looks like a per-session value -- confirm it is
    # still valid for the account in use.
    DEFAULT_FORMHASH = '78484d61'

    # Text shown in place of hidden content until the user has replied.
    # Both the simplified and the traditional spelling are accepted.
    HIDDEN_MARKERS = (
        "如果您要查看本帖隐藏内容请",
        "如果您要查看本帖隱藏內容請",
    )

    # ASCII-only so the match stops at adjacent CJK text.
    _URL_RE = re.compile(r"https://[\w\-./?%&=#:~+@]+", re.ASCII)
    # Four ASCII alphanumerics following the "extraction code" label; plain
    # ``\w`` would also match CJK characters and pick up the label itself.
    _CODE_RE = re.compile(r"提取[码碼][\s:：]*([A-Za-z0-9]{4})")
    _FORMHASH_RE = re.compile(r'name="formhash"\s+value="(\w+)"')

    def __init__(self, un="", pwd="", cookie=""):
        """
        :param un: account user name, used only by :meth:`enter`
        :param pwd: account password, used only by :meth:`enter`
        :param cookie: raw ``Cookie`` header value sent with every request
        """
        self.un = un
        self.pwd = pwd
        self.cookies = {}
        self.headers = {"cookie": cookie}
        self.bd_url_info = {}

    def enter(self):
        """Post the login form and keep the returned cookies in ``self.cookies``.

        NOTE(review): the credentials travel over plain ``http://``, and the
        stored cookies are not passed to any other request in this class.
        """
        params = {
            "mod": "logging",
            "action": "login",
            "loginsubmit": "yes",
            "infloat": "yes",
            "lssubmit": "yes",
            "inajax": "1"
        }
        data = {
            "fastloginfield": "username",
            "username": self.un,
            "password": self.pwd,
            "quickforward": "yes",
            "handlekey": "ls",
        }
        response = requests.post(
            self.MEMBER_URL,
            headers=self.headers,
            params=params,
            data=data,
            timeout=self.timeout,
        )
        self.cookies = requests.utils.dict_from_cookiejar(response.cookies)

    def fid_to_tid(self, fid, page=1, tid_s=None):
        """Walk a board's listing pages and gather the thread ids found there.

        Pages are fetched one after another, starting at ``page``, until a
        page contributes no id that was not already collected.

        :param fid: board (forum) id to list
        :param page: first listing page to fetch
        :param tid_s: ids already known; a falsy value starts a fresh set
        :return: set of thread ids as strings
        """
        if not tid_s:
            tid_s = set()
        while True:
            params = (
                ('mod', 'forumdisplay'),
                ('fid', fid),
                ('page', page),
                ('t', '5104641'),
            )
            response = requests.get(
                self.FORUM_URL,
                headers=self.headers,
                params=params,
                timeout=self.timeout,
            )
            # The response is a requests object, so hand Selector the body text.
            listing = Selector(text=response.text)
            row_ids = listing.xpath(
                '//*[@id="threadlisttableid"]/tbody[contains(@id,"normalthread")]/@id'
            ).extract()
            known_before = len(tid_s)
            tid_s.update(row_id.replace("normalthread_", "") for row_id in row_ids)
            if len(tid_s) == known_before:
                # Nothing new on this page: the listing is exhausted.
                return tid_s
            page += 1

    def get_content(self, fid, tid, retry_delay=60, max_retries=None):
        """Fetch a thread and record its share link and extraction code.

        While the page still shows the hidden-content notice, a reply is
        posted, the method sleeps ``retry_delay`` seconds and fetches again.
        On success the pair is printed and stored in ``self.bd_url_info``;
        if no link/code pair can be parsed, a message and the page URL are
        printed instead.

        :param fid: board id the thread belongs to (needed for the reply)
        :param tid: thread id to read
        :param retry_delay: seconds to wait after each reply before re-fetching
        :param max_retries: maximum number of replies to post; ``None`` keeps
            retrying without limit
        :return: None
        """
        attempts = 0
        while True:
            response = requests.get(
                f"http://rigasin.com/forum.php?mod=viewthread&tid={tid}&extra=page%3D1",
                headers=self.headers,
                timeout=self.timeout,
            )
            page_text = response.text
            if not any(marker in page_text for marker in self.HIDDEN_MARKERS):
                break
            if max_retries is not None and attempts >= max_retries:
                print("content still hidden, giving up:", response.url)
                return
            self.post_content(fid, tid, formhash=self._find_formhash(page_text))
            attempts += 1
            time.sleep(retry_delay)

        text_nodes = Selector(text=page_text).xpath(
            '//div[@class="showhide"]//text()'
        ).extract()
        share = self._parse_share(text_nodes)
        if share is None:
            print("share link or extraction code not found:", response.url)
            return
        bd_url, bd_pwd = share
        print(bd_url, bd_pwd)
        self.bd_url_info[bd_url] = bd_pwd

    def post_content(self, fid, tid, formhash=None, message='666'):
        """Post a reply to a thread.

        :param fid: board id of the thread
        :param tid: thread id to reply to
        :param formhash: form token to submit; falls back to
            ``DEFAULT_FORMHASH`` when empty
        :param message: reply text
        :return: the ``requests`` response of the POST
        """
        params = (
            ('mod', 'post'),
            ('infloat', 'yes'),
            ('action', 'reply'),
            ('fid', fid),
            ('extra', ''),
            ('tid', tid),
            ('replysubmit', 'yes'),
            ('inajax', '1'),
        )
        data = {
            'formhash': formhash or self.DEFAULT_FORMHASH,
            'handlekey': 'reply',
            'noticeauthor': '',
            'noticetrimstr': '',
            'noticeauthormsg': '',
            'usesig': '0',
            'subject': '',
            'message': message,
        }
        return requests.post(
            self.FORUM_URL,
            headers=self.headers,
            params=params,
            data=data,
            timeout=self.timeout,
        )

    def main(self):
        """Placeholder; does nothing."""
        pass

    @classmethod
    def _find_formhash(cls, html):
        """Return the ``formhash`` input value found in ``html``, or None."""
        # NOTE(review): assumes the thread page carries a
        # name="formhash" value="..." input -- confirm against a live page.
        found = cls._FORMHASH_RE.search(html or "")
        return found.group(1) if found else None

    @classmethod
    def _parse_share(cls, text_nodes):
        """Return ``(share_url, extraction_code)`` from text nodes, or None.

        The first ``https://`` URL and the first labelled four-character code
        are used; None is returned when either one is missing.
        """
        share_url = None
        code = None
        for node in text_nodes:
            if share_url is None:
                url_match = cls._URL_RE.search(node)
                if url_match:
                    share_url = url_match.group(0)
            if code is None:
                code_match = cls._CODE_RE.search(node)
                if code_match:
                    code = code_match.group(1)
        if share_url is None or code is None:
            return None
        return share_url, code

if __name__ == '__main__':
    # Replace the placeholder with the Cookie header copied from the
    # browser's developer tools (F12) while logged in to the forum.
    cookie = "瀏覽器F12打開取出cookie放到這里"
    spider = Spider(cookie=cookie)
    # Board ids to crawl; every thread found on each board is visited.
    for fid in [158]:
        for tid in spider.fid_to_tid(fid):
            spider.get_content(fid, tid)
    # Dump everything collected as {share_url: extraction_code}.
    print(spider.bd_url_info)

ou315001655 · 發表于 2020-5-5 23:46:27

沒回復的可以取到嗎？

請我喝茶 · 發表于 2020-5-6 11:08:05

ou315001655 發表于 2020-5-5 23:46
沒回復的可以取到嗎？

不能，不回復就想拿內容可以學滲透然后自己搞

請我喝茶 · 發表于 2020-5-6 11:09:26

ou315001655 發表于 2020-5-5 23:46
沒回復的可以取到嗎？

這個你睡前或者看視頻的時候掛起來讓它跑起來就OK，不用管他

qwertyuiop1822 · 發表于 2020-8-9 00:25:00

就是300秒內不能重復發言比較惡心，所以想都回復一下，以后搜就可以直接看了。保存那么多到百度云也沒用

fafa100 · 發表于 2020-8-27 08:10:08

zlmzygx8 · 發表于 2020-9-22 20:40:48

ilike · 發表于 2020-11-24 18:20:36

謝謝分享

415194510 · 發表于 2022-2-1 00:03:48

pgone · 發表于 2022-7-11 02:10:24

感謝樓主的無私分享！

		自動登錄	找回密碼
密碼			注冊成為正式會員

python實現自動采集VIP課程帖子中的百度網盤鏈接

終身VIP會員