Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

预报爬虫 #17

Open
zhulinpinyu opened this issue Apr 29, 2020 · 0 comments
Open

预报爬虫 #17

zhulinpinyu opened this issue Apr 29, 2020 · 0 comments

Comments

@zhulinpinyu
Copy link
Owner

import requests
from bs4 import BeautifulSoup

def weather(url):
  """Fetch the weather page at ``url`` and return its parsed contents.

  Args:
    url: Address of the page to download.

  Returns:
    Whatever ``extract_data`` produces for the HTML that ``get_raw_html``
    downloaded from ``url``.
  """
  return extract_data(get_raw_html(url))

def get_raw_html(url, timeout=10):
  """Download ``url`` with browser-like request headers and return the body text.

  Args:
    url: Address of the page to download.
    timeout: Seconds to wait for the server before giving up. ``requests``
      applies no timeout unless one is passed, so without it a stalled
      server would block this call indefinitely.

  Returns:
    The response body decoded to ``str`` by ``requests``.

  Raises:
    requests.exceptions.RequestException: On connection failure, timeout, or
      (via ``raise_for_status``) a 4xx/5xx response.
  """
  headers = {
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"
  }
  res = requests.get(url, headers=headers, timeout=timeout)
  # Surface HTTP errors here rather than returning an error page as if it
  # were the requested document.
  res.raise_for_status()
  return res.text

def extract_data(doc):
  """Parse a weather page into current conditions and two forecast lists.

  Args:
    doc: HTML source of the page, as text.

  Returns:
    A dict with three keys:
      'current': dict of values read from the ``div.c-left`` panel, keyed
        '位置' (location), '温度' (temperature), '天气' (weather), 'date',
        '风力' (wind), '湿度' (humidity), 'AQI', '空气质量' (air quality)
        and 'updated_at'.
      'forecast7d': list of dicts with 'date', 'text', 'des', 'maxTemp',
        'minTemp' and 'wind', built from ``div.c-right``.
      'forecast24h': list of dicts with 'time' and 'temp', built from
        ``div.cleft-24hours``.

  Raises:
    AttributeError, TypeError, KeyError, IndexError: If an element or
      attribute the selectors expect is missing from ``doc``; no fallback
      is attempted.
  """
  soup = BeautifulSoup(doc, "lxml")
  box = soup.find('div', class_='n-container')
  today_box = box.find('div', class_='c-left')
  week_box = box.find('div', class_='c-right')
  box_24h = soup.find('div', class_='cleft-24hours')

  def strings_of(parent, css_class):
    # ``.string`` of every <p class=css_class> under ``parent``, in document order.
    return [tag.string for tag in parent.find_all('p', class_=css_class)]

  # Current conditions. ``contents[n]`` picks a child node by position, so
  # these indices depend on the page's markup layout.
  # NOTE(review): presumably index 2 skips a leading label/icon node — confirm
  # against the live page.
  current = {
    '位置': box.select_one('a')['cname'],
    '温度': today_box.select_one('span.num').contents[0],
    '天气': today_box.select_one('p.text').string,
    'date': today_box.select_one('a.date').string.strip(),
    '风力': today_box.select_one('span.wind').contents[2],
    '湿度': today_box.select_one('span.hundity').contents[2],
    'AQI': today_box.select_one('span.liv-text > a > em').string,
    '空气质量': today_box.select_one('span.liv-text > a > span.liv-img').string,
    'updated_at': today_box.select_one('div.row4 > p').string
  }

  # Multi-day forecast: text columns come from <p> tags, the high/low
  # temperatures from comma-separated attributes on a single element.
  dates = strings_of(week_box, 'date')
  texts = strings_of(week_box, 'text')
  deses = strings_of(week_box, 'des')
  winds = strings_of(week_box, 'wind')
  temp_bar = week_box.select_one('div.r-temp')
  max_temps = temp_bar['data-high'].split(",")
  min_temps = temp_bar['data-low'].split(",")
  # zip stops at the shortest column, so a short column truncates the list.
  forecast7d = [
    {
      'date': day,
      'text': text,
      'des': des,
      'maxTemp': high,
      'minTemp': low,
      'wind': wind,
    }
    for day, text, des, high, low, wind
    in zip(dates, texts, deses, max_temps, min_temps, winds)
  ]

  # Hourly forecast.
  times = strings_of(box_24h, 'time')
  temps = strings_of(box_24h, 'temp')
  forecast24h = [
    {'time': time, 'temp': temp}
    for time, temp in zip(times, temps)
  ]

  return {
    'current': current,
    'forecast7d': forecast7d,
    'forecast24h': forecast24h
  }


# Demo run: download one forecast page and print the current conditions.
data = weather("http://tianqi.sogou.com/pc/weather/2332634")
current = data['current']
print(current['date'])
# Location, weather, temperature, humidity and wind on one space-separated line.
summary = (
  current['位置'],
  current['天气'],
  current['温度']+'°',
  current['湿度'],
  current['风力'],
)
print(*summary)
print('AQI: '+current['AQI'])
print('空气质量: ' + current['空气质量'])
print(current['updated_at'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant