68 lines
2.8 KiB
Python
68 lines
2.8 KiB
Python
|
import sys
|
||
|
import requests
|
||
|
from bs4 import BeautifulSoup
|
||
|
import csv
|
||
|
import urllib3
|
||
|
import os
|
||
|
|
||
|
# Silence urllib3's InsecureRequestWarning: parse_webpage() below deliberately
# calls requests.get(..., verify=False), which would otherwise warn per request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||
|
|
||
|
def parse_webpage(url):
    """Scrape athlete biography data from *url* and append it to webpage_data.csv.

    Finds ASP.NET-style <span> elements (ids ending in known label suffixes)
    inside 'biography'/'nestedBiography' tables, de-duplicates the extracted
    rows, and appends them to 'webpage_data.csv', writing a header row first
    when the file is empty.

    Parameters:
        url: page to fetch; its trailing path component (without extension)
             is recorded as the row's ID.
    """
    # verify=False: TLS verification is intentionally skipped for this site
    # (warnings are suppressed at module level via urllib3.disable_warnings).
    response = requests.get(url, verify=False)

    if response.status_code != 200:
        print("Failed to fetch webpage")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    discipline_type = soup.find('span', id=lambda x: x and x.endswith('_CategoryLabel'))
    discipline_type_cell = discipline_type.get_text(strip=True) if discipline_type else ''

    # Matches both 'nestedBiography' (multi-athlete pages) and 'biography'
    # (single-athlete pages) via the shared '...iography' suffix.
    tables = soup.find_all('table', class_=lambda x: x and x.endswith('iography'))

    # Fallback for coach lookup when a nested table lacks its own coach label.
    # May be None when the page has no plain 'biography' table.
    table_coach = soup.find('table', class_='biography')

    unique_keys = set()

    for table in tables:
        name_cells = table.find_all('span', id=lambda x: x and x.endswith('_person_cnameLabel'))

        first_names = []
        last_names = []
        for name_cell in name_cells:
            name_parts = name_cell.get_text(strip=True).split()
            if not name_parts:
                # Empty name span: skip instead of crashing on name_parts[0].
                continue
            first_names.append(name_parts[0])
            # Last-name words are upper-cased on these pages; keep only those.
            last_names.append(' '.join(part for part in name_parts[1:] if part.isupper()))

        if not first_names:
            # No usable name in this table: previously raised IndexError on
            # first_names[0]; there is nothing to record, so skip the table.
            continue

        birth_place = table.find('span', id=lambda x: x and x.endswith('_person_pobLabel'))
        birth_place_cell = birth_place.get_text(strip=True) if birth_place else ''

        country = table.find('span', id=lambda x: x and x.endswith('_person_nationLabel'))
        country_cell = country.get_text(strip=True) if country else ''

        birth_date = table.find('span', id=lambda x: x and x.endswith('_person_dobLabel'))
        birth_date_cell = birth_date.get_text(strip=True) if birth_date else ''

        coach = table.find('span', id=lambda x: x and x.endswith('_person_media_information_coachLabel'))
        if not coach and table_coach is not None:
            # Fall back to the top-level biography table; guard against
            # table_coach being None (previously an AttributeError).
            coach = table_coach.find('span', id=lambda x: x and x.endswith('_person_media_information_coachLabel'))
        coach_cell = coach.get_text(strip=True) if coach else ''

        keys = (
            first_names[0],
            last_names[0],
            birth_place_cell,
            country_cell,
            birth_date_cell,
            discipline_type_cell,
            coach_cell,
            url.split('/')[-1].split('.')[0],  # page id from the URL's filename
        )
        unique_keys.add(keys)

    with open('webpage_data.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Write the header only when the file is brand new / empty.
        if os.stat('webpage_data.csv').st_size == 0:
            writer.writerow(['First Name', 'Last Name', 'Birth Place', 'Country', 'Birth Date', 'Type', 'Coach', 'ID'])
        for keys in unique_keys:
            writer.writerow(keys)
|
||
|
|
||
|
if __name__ == "__main__":
    # The script takes exactly one positional argument: the URL to scrape.
    if len(sys.argv) != 2:
        print("Usage: python script.py <url>")
    else:
        parse_webpage(sys.argv[1])
|