How do I get the product id from a page on a site?
I am trying to scrape ozon.ru. The code is split into two files, main and funk (funk is saved as functions.py, which is what main imports). funk contains a function that opens the page of a given product in a new tab and is supposed to read the product id from that page, but instead the script opens the "about" page and then finishes. What am I doing wrong? (A small diagnostic sketch for the collected links is at the end of the post.) funk:
import time as tm

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By


def page_down(driver):
    driver.execute_script('''
        const scrollStep = 200;      // scroll step size (in pixels)
        const scrollInterval = 100;  // interval between steps (in milliseconds)

        const scrollHeight = document.documentElement.scrollHeight;
        let currentPosition = 0;

        const interval = setInterval(() => {
            window.scrollBy(0, scrollStep);
            currentPosition += scrollStep;

            if (currentPosition >= scrollHeight) {
                clearInterval(interval);
            }
        }, scrollInterval);
    ''')
def collect_product_info(driver, url=''):
    # open the product page in a new tab
    driver.switch_to.new_window('tab')
    tm.sleep(3)
    driver.get(url=url)
    tm.sleep(3)

    # product_id: taken from the "Артикул: ..." element on the page
    tm.sleep(10)
    product_id = driver.find_element(
        By.XPATH, '//div[contains(text(), "Артикул: ")]'
    ).text.split('Артикул: ')[1]

    page_source = str(driver.page_source)
    soup = BeautifulSoup(page_source, 'lxml')

    with open(f'product_{product_id}.html', 'w', encoding='utf-8') as file:
        file.write(page_source)

    product_name = soup.find('div', attrs={"data-widget": 'webProductHeading'}).find(
        'h1').text.strip().replace('\t', '').replace('\n', ' ')
    # earlier attempt: take product_id from the soup instead of the live DOM
    # try:
    #     product_id = soup.find('div', string=re.compile(
    #         'Артикул:')).text.split('Артикул: ')[1].strip()
    # except:
    #     product_id = None
    # product statistic (rating and review count)
    product_stars = None
    product_reviews = None
    try:
        product_statistic = soup.find(
            'div', attrs={"data-widget": 'webSingleProductScore'}).text.strip()
        if ' • ' in product_statistic:
            product_stars = product_statistic.split(' • ')[0].strip()
            product_reviews = product_statistic.split(' • ')[1].strip()
    except AttributeError:
        product_statistic = None
    # product price
    try:
        ozon_card_price_element = soup.find(
            'span', string='c Ozon Картой').parent.find('div').find('span')
        product_ozon_card_price = ozon_card_price_element.text.strip(
        ) if ozon_card_price_element else ''

        price_element = soup.find(
            'span', string='без Ozon Карты').parent.parent.find('div').find_all('span')
        product_discount_price = price_element[0].text.strip() if price_element[0] else ''
        product_base_price = price_element[1].text.strip() if price_element[1] is not None else ''
    except AttributeError:
        product_ozon_card_price = None
        product_discount_price = None
        product_base_price = None
    # product price fallback: if the "c Ozon Картой" block is missing,
    # read the prices from the webPrice widget instead
    try:
        soup.find('span', string='c Ozon Картой').parent.find('div').find('span')
    except AttributeError:
        card_price_div = soup.find(
            'div', attrs={"data-widget": "webPrice"}).find_all('span')
        product_base_price = card_price_div[0].text.strip()
        product_discount_price = card_price_div[1].text.strip()
    product_data = {
        'product_id': product_id,
        'product_name': product_name,
        'product_ozon_card_price': product_ozon_card_price,
        'product_discount_price': product_discount_price,
        'product_base_price': product_base_price,
        'product_statistic': product_statistic,
        'product_stars': product_stars,
        'product_reviews': product_reviews,
    }

    # close the product tab and return to the search-results tab
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

    return product_data
main:
import json
import time

import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from functions import page_down, collect_product_info
def get_products_links(item_name='клавиатура для компьютера механика'):
    # item_name is the search query typed into ozon.ru
    # (Russian for "mechanical computer keyboard")
    driver = uc.Chrome()
    driver.implicitly_wait(5)
    driver.get(url='https://www.ozon.ru/')
    time.sleep(2)

    find_input = driver.find_element(By.NAME, 'text')
    find_input.clear()
    find_input.send_keys(item_name)
    time.sleep(2)
    find_input.send_keys(Keys.ENTER)
    time.sleep(2)

    # reload the results sorted by rating
    current_url = f'{driver.current_url}&sorting=rating'
    driver.get(url=current_url)
    time.sleep(2)
    # page_down(driver=driver)
    time.sleep(2)
    products_urls = []
    try:
        find_links = driver.find_elements(By.CLASS_NAME, 'tsBody500Medium')
        products_urls = list({link.get_attribute('href') for link in find_links})
        print('[+] Product links collected!')
    except Exception:
        print('[!] Something went wrong while collecting product links!')

    products_urls_dict = dict(enumerate(products_urls))

    with open('products_urls_dict.json', 'w', encoding='utf-8') as file:
        json.dump(products_urls_dict, file, indent=4, ensure_ascii=False)
    time.sleep(2)

    products_data = []
    for url in products_urls:
        data = collect_product_info(driver=driver, url=url)
        print(f'[+] Collected data for product id: {data.get("product_id")}')
        time.sleep(2)
        products_data.append(data)

    with open('PRODUCTS_DATA.json', 'w', encoding='utf-8') as file:
        json.dump(products_data, file, indent=4, ensure_ascii=False)

    driver.quit()  # quit() closes every window, so a separate close() is not needed
def main():
    print('[INFO] Data collection started. Please wait...')
    get_products_links(item_name='клавиатура для компьютера механика')
    print('[INFO] Job finished successfully!')


if __name__ == '__main__':
    main()
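Since the run ends on the "about" page, my suspicion is that By.CLASS_NAME 'tsBody500Medium' also matches footer/service links on the results page, not only product cards, so collect_product_info gets called with a non-product URL. Below is a minimal diagnostic sketch I run over the saved products_urls_dict.json to check this; the idea that real product URLs contain '/product/' is my assumption and not verified against Ozon's markup:

import json

# Inspect the links saved by get_products_links() before visiting them.
# Assumption (not verified): product pages contain '/product/' in the URL,
# while service pages such as https://www.ozon.ru/about do not.
with open('products_urls_dict.json', encoding='utf-8') as file:
    urls = list(json.load(file).values())

product_urls = [u for u in urls if u and '/product/' in u]
other_urls = [u for u in urls if not (u and '/product/' in u)]

print(f'product-like links: {len(product_urls)}')
print(f'suspicious links (first 10): {other_urls[:10]}')

If the suspicious list is non-empty, is filtering products_urls this way before the for-loop the right fix, or is there a better selector for the product cards?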