first commit

This commit is contained in:
evilcel3ri 2024-05-04 14:25:31 +02:00
commit 6e2e743659
5 changed files with 95 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@@ -0,0 +1 @@
.env/

21
README.md Normal file
View file

@@ -0,0 +1,21 @@
Install:
```
python3.9 -m venv .env
source .env/bin/activate
```
Usage:
```
python script.py URL
```
Tested urls:
OK:
https://www.isuresults.com/bios/isufs00103105.htm
https://www.isuresults.com/bios/isufs00103105.htm
No:
https://www.isuresults.com/bios/isufs00103105.htm

2
requirements.txt Normal file
View file

@@ -0,0 +1,2 @@
requests
beautifulsoup4

67
script.py Normal file
View file

@@ -0,0 +1,67 @@
import sys
import requests
from bs4 import BeautifulSoup
import csv
import urllib3
import os
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def parse_webpage(url):
    """Scrape an isuresults.com biography page and append athlete rows to a CSV.

    For each athlete found on the page, one row is appended to
    ``webpage_data.csv`` in the current directory with the columns:
    First Name, Last Name, Birth Place, Country, Birth Date, Type, Coach, ID.
    A header row is written first if the file is empty or new.

    Parameters
    ----------
    url : str
        URL of an ISU biography page, e.g.
        ``https://www.isuresults.com/bios/isufs00103105.htm``.

    Side effects: performs an HTTP GET and writes to ``webpage_data.csv``.
    Prints an error message (and returns None) when the fetch fails.
    """
    # verify=False because the target site has certificate issues;
    # the matching urllib3 warning is silenced at module level.
    response = requests.get(url, verify=False)
    if response.status_code != 200:
        print("Failed to fetch webpage")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    discipline_type = soup.find('span', id=lambda x: x and x.endswith('_CategoryLabel'))
    discipline_type_cell = discipline_type.get_text(strip=True) if discipline_type else ''

    # Matches both "nestedBiography" (pairs / ice dance) and "biography"
    # (single athlete) tables via the common "...iography" suffix.
    tables = soup.find_all('table', class_=lambda x: x and x.endswith('iography'))
    # Fallback source for the coach cell when a nested table lacks one.
    # May be None when the page has no plain "biography" table.
    table_coach = soup.find('table', class_='biography')

    # De-duplicate rows: nested tables can repeat the same athlete.
    unique_keys = set()
    # The trailing path component without extension, e.g. "isufs00103105".
    page_id = url.split('/')[-1].split('.')[0]

    for table in tables:
        name_cells = table.find_all('span', id=lambda x: x and x.endswith('_person_cnameLabel'))
        first_names = []
        last_names = []
        for name_cell in name_cells:
            name_parts = name_cell.get_text(strip=True).split()
            if not name_parts:
                continue
            # ISU convention: given name first, family name in ALL CAPS
            # (possibly several words).
            first_names.append(name_parts[0])
            last_names.append(' '.join(part for part in name_parts[1:] if part.isupper()))
        if not first_names:
            # No athlete name in this table — skip instead of crashing
            # on first_names[0] below.
            continue

        birth_place = table.find('span', id=lambda x: x and x.endswith('_person_pobLabel'))
        birth_place_cell = birth_place.get_text(strip=True) if birth_place else ''
        country = table.find('span', id=lambda x: x and x.endswith('_person_nationLabel'))
        country_cell = country.get_text(strip=True) if country else ''
        birth_date = table.find('span', id=lambda x: x and x.endswith('_person_dobLabel'))
        birth_date_cell = birth_date.get_text(strip=True) if birth_date else ''

        coach = table.find('span', id=lambda x: x and x.endswith('_person_media_information_coachLabel'))
        if not coach and table_coach is not None:
            coach = table_coach.find('span', id=lambda x: x and x.endswith('_person_media_information_coachLabel'))
        coach_cell = coach.get_text(strip=True) if coach else ''

        unique_keys.add((
            first_names[0],
            last_names[0],
            birth_place_cell,
            country_cell,
            birth_date_cell,
            discipline_type_cell,
            coach_cell,
            page_id,
        ))

    with open('webpage_data.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Only emit the header when the file is brand new / empty.
        if os.stat('webpage_data.csv').st_size == 0:
            writer.writerow(['First Name', 'Last Name', 'Birth Place', 'Country',
                             'Birth Date', 'Type', 'Coach', 'ID'])
        for keys in unique_keys:
            writer.writerow(keys)
if __name__ == "__main__":
    # Expect exactly one positional argument: the biography-page URL.
    if len(sys.argv) == 2:
        parse_webpage(sys.argv[1])
    else:
        print("Usage: python script.py <url>")

4
webpage_data.csv Normal file
View file

@ -0,0 +1,4 @@
First Name,Last Name,Birth Place,Country,Birth Date,Type,Coach,ID
Karolina,CALHOUN,Long Beach USA,BRA,29.10.1999,ICE DANCE,"Christine Binder, Vitali Novikov",isufs00103105
Michael,VALDEZ,Pasadena USA,BRA,06.08.1997,ICE DANCE,"Christine Binder, Vitali Novikov",isufs00103105
Mirai,NAGASU,"Montebello, CA",USA,16.04.1993,LADIES,Tom Zakrajsek,isufs00010220
1 First Name Last Name Birth Place Country Birth Date Type Coach ID
2 Karolina CALHOUN Long Beach USA BRA 29.10.1999 ICE DANCE Christine Binder, Vitali Novikov isufs00103105
3 Michael VALDEZ Pasadena USA BRA 06.08.1997 ICE DANCE Christine Binder, Vitali Novikov isufs00103105
4 Mirai NAGASU Montebello, CA USA 16.04.1993 LADIES Tom Zakrajsek isufs00010220