first commit
This commit is contained in:
commit
6e2e743659
5 changed files with 95 additions and 0 deletions
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
.env/
|
21
README.md
Normal file
21
README.md
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
Install:
|
||||||
|
|
||||||
|
```
|
||||||
|
python3.9 -m venv .env
|
||||||
|
source .env/bin/activate
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
|
||||||
|
```
|
||||||
|
python script.py URL
|
||||||
|
```
|
||||||
|
|
||||||
|
Tested urls:
|
||||||
|
|
||||||
|
OK:
|
||||||
|
https://www.isuresults.com/bios/isufs00103105.htm
|
||||||
|
https://www.isuresults.com/bios/isufs00103105.htm
|
||||||
|
|
||||||
|
No:
|
||||||
|
https://www.isuresults.com/bios/isufs00103105.htm
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
requests
|
||||||
|
beautifulsoup4
|
67
script.py
Normal file
67
script.py
Normal file
|
@ -0,0 +1,67 @@
|
||||||
|
import sys
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import csv
|
||||||
|
import urllib3
|
||||||
|
import os
|
||||||
|
|
||||||
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
|
def parse_webpage(url):
    """Scrape an ISU athlete biography page and append one CSV row per
    athlete biography table found on it.

    Parameters:
        url: address of an isuresults.com bio page, e.g.
             https://www.isuresults.com/bios/isufs00103105.htm

    Side effects:
        Appends rows to 'webpage_data.csv' in the current directory
        (writing a header first when the file is empty); prints an
        error message when the page cannot be fetched.
    """
    # NOTE(review): certificate verification is deliberately disabled
    # (the matching urllib3 warning is silenced at module level) --
    # confirm the target host actually requires this.
    response = requests.get(url, verify=False)

    if response.status_code != 200:
        print("Failed to fetch webpage")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Discipline (e.g. "ICE DANCE") is shared by every athlete on the page.
    discipline_type = soup.find('span', id=lambda x: x and x.endswith('_CategoryLabel'))
    discipline_type_cell = discipline_type.get_text(strip=True) if discipline_type else ''

    # Matches both 'nestedBiography' (couples) and 'biography'
    # (single athlete) tables.
    tables = soup.find_all('table', class_=lambda x: x and x.endswith('iography'))

    # Fallback container for the coach label when a nested table lacks it.
    table_coach = soup.find('table', class_='biography')

    # Page id, e.g. 'isufs00103105' from '.../isufs00103105.htm'.
    page_id = url.split('/')[-1].split('.')[0]

    unique_keys = set()

    for table in tables:
        name_cells = table.find_all('span', id=lambda x: x and x.endswith('_person_cnameLabel'))

        first_names = []
        last_names = []
        for name_cell in name_cells:
            name_parts = name_cell.get_text(strip=True).split()
            if not name_parts:
                # BUG FIX: an empty name label used to crash on name_parts[0].
                continue
            first_names.append(name_parts[0])
            # Last names are rendered in upper case on these pages.
            last_names.append(' '.join(part for part in name_parts[1:] if part.isupper()))

        if not first_names:
            # BUG FIX: the original indexed first_names[0] unconditionally
            # and raised IndexError on tables without a name label.
            continue

        birth_place = table.find('span', id=lambda x: x and x.endswith('_person_pobLabel'))
        birth_place_cell = birth_place.get_text(strip=True) if birth_place else ''

        country = table.find('span', id=lambda x: x and x.endswith('_person_nationLabel'))
        country_cell = country.get_text(strip=True) if country else ''

        birth_date = table.find('span', id=lambda x: x and x.endswith('_person_dobLabel'))
        birth_date_cell = birth_date.get_text(strip=True) if birth_date else ''

        coach = table.find('span', id=lambda x: x and x.endswith('_person_media_information_coachLabel'))
        if not coach and table_coach is not None:
            # BUG FIX: guard against table_coach being None before .find().
            coach = table_coach.find('span', id=lambda x: x and x.endswith('_person_media_information_coachLabel'))
        coach_cell = coach.get_text(strip=True) if coach else ''

        # Only the first listed athlete's names are recorded per table,
        # matching the original behavior.
        unique_keys.add((first_names[0], last_names[0], birth_place_cell,
                         country_cell, birth_date_cell, discipline_type_cell,
                         coach_cell, page_id))

    with open('webpage_data.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # A zero-size file means we just created it: write the header once.
        if os.stat('webpage_data.csv').st_size == 0:
            writer.writerow(['First Name', 'Last Name', 'Birth Place', 'Country',
                             'Birth Date', 'Type', 'Coach', 'ID'])
        for keys in unique_keys:
            writer.writerow(keys)
|
if __name__ == "__main__":
    # Expect exactly one CLI argument: the bio page URL to scrape.
    if len(sys.argv) == 2:
        parse_webpage(sys.argv[1])
    else:
        print("Usage: python script.py <url>")
4
webpage_data.csv
Normal file
4
webpage_data.csv
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
First Name,Last Name,Birth Place,Country,Birth Date,Type,Coach,ID
|
||||||
|
Karolina,CALHOUN,Long Beach USA,BRA,29.10.1999,ICE DANCE,"Christine Binder, Vitali Novikov",isufs00103105
|
||||||
|
Michael,VALDEZ,Pasadena USA,BRA,06.08.1997,ICE DANCE,"Christine Binder, Vitali Novikov",isufs00103105
|
||||||
|
Mirai,NAGASU,"Montebello, CA",USA,16.04.1993,LADIES,Tom Zakrajsek,isufs00010220
|
|
Loading…
Reference in a new issue