ice_skating_parsing/script.py
2024-05-04 14:25:31 +02:00

67 lines
2.8 KiB
Python

import sys
import requests
from bs4 import BeautifulSoup
import csv
import urllib3
import os
# Requests are made with verify=False below; silence the per-request
# InsecureRequestWarning so output stays readable.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def parse_webpage(url):
    """Fetch an athlete biography page and append parsed rows to webpage_data.csv.

    Parses every ``*iography`` table (matches both "biography" and
    "nestedBiography" classes, i.e. single athletes and pairs/teams),
    extracts name, birth place/date, country, discipline and coach, and
    appends one de-duplicated CSV row per table. The last path segment of
    *url* (without extension) is recorded as the ID column.

    Prints "Failed to fetch webpage" and returns on a non-200 response.
    """
    # NOTE(review): TLS verification is deliberately disabled; warnings are
    # suppressed at module import.
    response = requests.get(url, verify=False)
    if response.status_code != 200:
        # Early-return guard instead of wrapping the whole body in an if.
        print("Failed to fetch webpage")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    discipline_type = soup.find('span', id=lambda x: x and x.endswith('_CategoryLabel'))
    discipline_type_cell = discipline_type.get_text(strip=True) if discipline_type else ''

    # Matches both nestedBiography and biography tables (single athlete or team).
    tables = soup.find_all('table', class_=lambda x: x and x.endswith('iography'))
    # Fallback source for the coach field when a nested table lacks its own.
    table_coach = soup.find('table', class_='biography')

    unique_keys = set()
    for table in tables:
        name_cells = table.find_all('span', id=lambda x: x and x.endswith('_person_cnameLabel'))
        first_names = []
        last_names = []
        for name_cell in name_cells:
            name_parts = name_cell.get_text(strip=True).split()
            # Guard against an empty name cell (original raised IndexError).
            first_names.append(name_parts[0] if name_parts else '')
            # Convention on the page: last name(s) appear in ALL CAPS after
            # the first token.
            last_names.append(' '.join(part for part in name_parts[1:] if part.isupper()))
        if not first_names:
            # No name span in this table: nothing meaningful to record
            # (original code raised IndexError on first_names[0] here).
            continue

        birth_place = table.find('span', id=lambda x: x and x.endswith('_person_pobLabel'))
        birth_place_cell = birth_place.get_text(strip=True) if birth_place else ''
        country = table.find('span', id=lambda x: x and x.endswith('_person_nationLabel'))
        country_cell = country.get_text(strip=True) if country else ''
        birth_date = table.find('span', id=lambda x: x and x.endswith('_person_dobLabel'))
        birth_date_cell = birth_date.get_text(strip=True) if birth_date else ''

        coach = table.find('span', id=lambda x: x and x.endswith('_person_media_information_coachLabel'))
        if not coach and table_coach is not None:
            # Fall back to the page-level biography table; the None check
            # fixes an AttributeError when that table is absent.
            coach = table_coach.find('span', id=lambda x: x and x.endswith('_person_media_information_coachLabel'))
        coach_cell = coach.get_text(strip=True) if coach else ''

        # ID = last URL path segment minus its file extension.
        athlete_id = url.split('/')[-1].split('.')[0]
        unique_keys.add((first_names[0], last_names[0], birth_place_cell,
                         country_cell, birth_date_cell, discipline_type_cell,
                         coach_cell, athlete_id))

    # Append mode so repeated invocations accumulate rows; write the header
    # only when the file is still empty.
    with open('webpage_data.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if os.stat('webpage_data.csv').st_size == 0:
            writer.writerow(['First Name', 'Last Name', 'Birth Place', 'Country',
                             'Birth Date', 'Type', 'Coach', 'ID'])
        for keys in unique_keys:
            writer.writerow(keys)
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python script.py <url>")
else:
url = sys.argv[1]
parse_webpage(url)