!
!
29日是爬取的最后一天数据,存完了还是从爬取当天25日又存了两遍同样的数据,这是怎么回事呀?
能不能从pycharm中写代码抑制它存入2遍或者多遍呀?后面附了代码,(还有有没有大神能教一个简单实用的不间断爬取的函数呀import time然后定义一个函数def round_time():…这种
import re
import time
import requests
import pymysql
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import datetime
conn = pymysql.connect(host='localhost', user='root', passwd='789456', db='test', port=3306, charset='utf8')
cursor=conn.cursor()
url = 'https://tianqi.2345.com/'
html = requests.get(url).text
Pattern = re.compile('{"temp":(.*?)}')
datas = re.findall(Pattern, html)
# fd = open('weather_data.txt', 'w', encoding='utf8')
# fd.write('日期,时间,温度,天气,风向,风级,空气质量\n')
url = 'https://tianqi.2345.com/'
service=Service('C:\Program Files\Google\Chrome\Application\chromedriver_win32\chromedriver.exe')
browser=webdriver.Chrome(service=service)
browser.get(url)
soup=BeautifulSoup(browser.page_source,'lxml')
data_quality=soup.find('div','banner-right-canvas-kq-i clearfix').find_all('i')
print('列表信息')
print(data_quality)
for num in data_quality:
quality=num.get_text()
for line in datas:
data = '"temp":' + line.encode('utf-8').decode('unicode_escape')
tmp = re.findall('"temp":"(.*?)"', data)
weather = re.findall('"weather":"(.*?)"', data)
day = re.findall('"day":"(.*?)"', data)
tm = re.findall('"time_origin_text":"(.*?)"', data)
wind_direction = re.findall('"wind_direction":"(.*?)"', data)
wind_level = re.findall('"wind_level":"(.*?)"', data)
print(day[0], tm[0], tmp[0] + '°', weather[0], wind_direction[0], wind_level[0], quality[0])
# fd.write('{},{},{},{},{},{}\n'.format(day[0], time[0], tmp[0]+'°', weather[0], wind_direction[0], wind_level[0]))
# fd.close()
sql = "INSERT INTO mytable(day,tm,temp,weather,wind,wscale,quality) VALUES ('%s','%s','%s','%s','%s','%s','%s')" % (
day[0], tm[0], tmp[0] + '°', weather[0], wind_direction[0], wind_level[0], quality[0])
#cursor.execute(sql)
cursor.execute(sql)
conn.commit()
conn.close()