getmd.py (1.5 KB)
  1. import requests, html2text, re
  2. # 请求头,模拟浏览器UA
  3. headers = {'User-Agent': ' '.join(['Mozilla/5.0 (Windows NT 10.0; Win64; x64; ServiceUI 14)',
  4. 'AppleWebKit/537.36 (KHTML, like Gecko)', 'Chrome/70.0.3538.102', 'Safari/537.36','Edge/18.18363']) }
  5. def url_to_markdown(url):
  6. # 发送请求
  7. r = requests.get(url=url, headers=headers)
  8. # Python requests乱码的五种解决办法 https://blog.csdn.net/lilongsy/article/details/122140098
  9. r.encoding = r.apparent_encoding
  10. # html 转换 markdown
  11. html = r.text
  12. text = html2text.html2text(html)
  13. return text
  14. def pull_urls(urls_list):
  15. texts =''
  16. for url in urls_list:
  17. texts += url_to_markdown(url)
  18. return texts
  19. # 正则搜索得到网址URL
  20. def get_url(line):
  21. reg_https = r'(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
  22. url = re.search(reg_https, line)
  23. return url
  24. # 检查获得多行URL
  25. def check_urls(urls):
  26. ret = []
  27. lines = urls.split('\n')
  28. for line in lines:
  29. url = get_url(line)
  30. if url is not None:
  31. ret.append(url[0])
  32. ret = list(set(ret))
  33. return ret
  34. def urls_lines(urls_list):
  35. str = '\n'.join(urls_list)
  36. return str
  37. import base64 , hashlib, time
  38. # 构建 PASSKEY
  39. def make_passkey(str=''):
  40. s = time.strftime("%Y%m%d-%H", time.localtime()) + str
  41. b = s.encode("utf-8")
  42. m = hashlib.sha256()
  43. m.update(b)
  44. passkey = base64.b64encode(m.digest()).decode("utf-8")[8:16]
  45. return passkey