"""Scrape athlete biography fields from a web page and append them to a CSV.

Usage: python script.py <url>
"""

import csv
import os
import sys

import requests
import urllib3
from bs4 import BeautifulSoup

# Pages are fetched with certificate verification disabled (see
# parse_webpage), so silence the warning urllib3 would print on every request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

OUTPUT_CSV = 'webpage_data.csv'
CSV_HEADER = ['First Name', 'Last Name', 'Birth Place', 'Country',
              'Birth Date', 'Type', 'Coach', 'ID']
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30


def _ends_with(suffix):
    """Return a predicate that is true for attribute values ending in *suffix*.

    BeautifulSoup calls the predicate with ``None`` for elements that lack
    the attribute, hence the truthiness guard.
    """
    return lambda value: bool(value) and value.endswith(suffix)


def _find_label(node, suffix):
    """Return the first ``<span>`` under *node* whose id ends with *suffix*.

    Returns ``None`` when *node* is ``None`` or no such span exists.
    """
    if node is None:
        return None
    return node.find('span', id=_ends_with(suffix))


def _label_text(node, suffix):
    """Return the stripped text of the matching label span, or ``''``."""
    label = _find_label(node, suffix)
    return label.get_text(strip=True) if label is not None else ''


def parse_webpage(url: str) -> None:
    """Fetch *url*, extract one row per biography table, append rows to CSV.

    Each row holds: first name, last name, birth place, country, birth date,
    discipline type, coach, and an ID taken from the last path segment of
    *url* (the text before its first dot). Duplicate rows found on the same
    page are written once, in the order they appear on the page. The header
    row is written only when the CSV file is empty.

    Prints "Failed to fetch webpage" (and returns) when the request fails or
    the response status is not 200.

    Args:
        url: Address of the page to scrape.
    """
    try:
        # NOTE(review): verify=False disables TLS certificate checking. It is
        # kept because the original did so deliberately, but it exposes the
        # fetch to man-in-the-middle tampering -- confirm it is still needed.
        response = requests.get(url, verify=False, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print("Failed to fetch webpage:", exc)
        return

    if response.status_code != 200:
        print("Failed to fetch webpage")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    discipline_type_cell = _label_text(soup, '_CategoryLabel')
    page_id = url.split('/')[-1].split('.')[0]

    # Matches both "nestedBiography" tables and the plain "biography" table
    # that is used when the page describes a single athlete.
    tables = soup.find_all('table', class_=_ends_with('iography'))
    # Fallback source for the coach label when a table does not carry its
    # own; may be None if the page has no table with class "biography".
    table_coach = soup.find('table', class_='biography')

    # A dict is used as an insertion-ordered set so duplicates are dropped
    # while the CSV row order stays stable between runs.
    unique_rows = {}
    for table in tables:
        name_parts = _label_text(table, '_person_cnameLabel').split()
        if not name_parts:
            # No (or empty) name label: nothing identifies this row, skip it
            # rather than crash on name_parts[0].
            continue
        first_name = name_parts[0]
        # Only fully upper-case words after the first one count as last name.
        last_name = ' '.join(
            part for part in name_parts[1:] if part.isupper()
        )

        coach = _find_label(table, '_person_media_information_coachLabel')
        if coach is None:
            coach = _find_label(
                table_coach, '_person_media_information_coachLabel'
            )
        coach_cell = coach.get_text(strip=True) if coach is not None else ''

        row = (
            first_name,
            last_name,
            _label_text(table, '_person_pobLabel'),
            _label_text(table, '_person_nationLabel'),
            _label_text(table, '_person_dobLabel'),
            discipline_type_cell,
            coach_cell,
            page_id,
        )
        unique_rows[row] = None

    with open(OUTPUT_CSV, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Append mode creates the file if needed; an empty file still needs
        # its header row.
        if os.stat(OUTPUT_CSV).st_size == 0:
            writer.writerow(CSV_HEADER)
        writer.writerows(unique_rows)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python script.py <url>")
    else:
        parse_webpage(sys.argv[1])