import logging
import urllib.parse
import bs4
import requests
import collections
# Configure the root logger so DEBUG output from the scraper is visible on stderr.
logging.basicConfig(level=logging.DEBUG)
# Module-level logger ('wb') shared by the parsing helpers below.
logger = logging.getLogger('wb')
# One scraped catalog entry: brand, product name, and the product page URL.
ParseResult = collections.namedtuple(
    'ParseResult',
    ['BrandName', 'Goods_name', 'Url'],
)
def parse_block(block):
    """Extract the product-page URL from one catalog item block.

    Args:
        block: a bs4 Tag for a single product card (``div.product-item__img``).

    Returns:
        The ``href`` string of the card's first ``<a>`` tag, or ``None``
        when the card has no link or the link carries no ``href``.
    """
    log = logging.getLogger('wb')
    url_block = block.select_one('a')
    # NOTE: must compare against None explicitly — a bs4 Tag with no
    # children is falsy (Tag truthiness follows len(contents)), so
    # `if not url_block` would wrongly reject a childless <a href=...>.
    if url_block is None:
        log.error('no_url_block')
        return None
    url = url_block.get('href')
    if not url:
        log.error('no_href')
        return None
    log.info('%s', url)
    return url
def parse_page(text: str):
    """Parse a catalog HTML page and hand each product card to parse_block."""
    document = bs4.BeautifulSoup(text, "html.parser")
    for card in document.select('div.product-item__img'):
        parse_block(block=card)
class Client:
    """HTTP client that scrapes the dogeat.ru dog-food catalog."""

    # Default catalog listing to scrape.
    CATALOG_URL = 'https://www.dogeat.ru/catalog/korm/dlya-sobak/'

    def __init__(self):
        # One shared session so headers/cookies apply to every request.
        self.session = requests.Session()
        # update() keeps requests' default headers instead of replacing them all.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51'})
        # Accumulator for scraped results.
        self.result = []

    def load_page(self, url: str = CATALOG_URL):
        """Download a catalog page and return its HTML text.

        Args:
            url: page to fetch; defaults to the first catalog page.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        res = self.session.get(url=url)
        # Fail loudly on 4xx/5xx instead of silently parsing an error page.
        res.raise_for_status()
        return res.text

    def pagination_limit(self):
        """Return the last page number (the 'p' query parameter) as a string.

        Returns ``None`` when the page has no pagination links, the last
        link has no href, or the href carries no 'p' parameter.
        """
        # BUG FIX: was self.get_page(), a method that does not exist.
        text = self.load_page()
        soup = bs4.BeautifulSoup(text, 'html.parser')
        buttons = soup.select('a.pagination')
        if not buttons:
            # No pagination on the page — guard against IndexError.
            return None
        href = buttons[-1].get('href')
        if not href:
            return None
        parsed = urllib.parse.urlparse(href)
        params = urllib.parse.parse_qs(parsed.query)
        pages = params.get('p')
        # parse_qs maps each key to a list of values; take the first.
        return pages[0] if pages else None

    def run(self):
        """Fetch the first catalog page and parse every product card on it."""
        text = self.load_page()
        parse_page(text=text)
if __name__ == '__main__':
    # Script entry point: scrape the catalog once.
    Client().run()