"""Scrape SIAT (Bolivian tax authority) invoice documentation pages.

``get_doc`` extracts the field-description table from a documentation page
and writes it to a JSON file; ``get_schema`` uses a Selenium driver to locate
the "Descarga XML/XSD" link on a page, then downloads and unpacks the zip
bundle it points to.
"""
import json
from zipfile import ZipFile

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # NOTE(review): unused here; kept in case other code relies on it


def get_doc(link):
    """Fetch *link*, parse its first field-description table, dump to JSON.

    The output file name is derived from the page's "Descarga XML/XSD"
    download link: the file stem with the literal ``XML`` stripped, plus
    ``.json``.  In the table, a single-cell row starts a new named section;
    rows with more than one cell (and a non-empty first cell) become
    ``{'name', 'text'}`` entries of the current section.

    Raises IndexError if the page lacks the download anchor or a table
    (mirrors the original behavior; callers wrap calls in try/except).
    """
    resp = requests.get(link)
    soup = BeautifulSoup(resp.content, 'html.parser')

    # The first "Descarga XML/XSD" anchor names the schema bundle; its
    # file stem (minus extension) seeds the JSON output name.
    anchor = soup.find_all('a', href=True, string='Descarga XML/XSD')[0]
    fname = anchor['href'].split('/')[-1].split('.')[0]

    tables = soup.find_all('table')
    result = {"sections": []}
    entries = []
    section_name = ''
    for row in tables[0].find_all('tr')[1:]:  # skip the header row
        cells = [c.text.strip() for c in row.find_all('td')]
        if len(cells) > 1 and cells[0]:
            entries.append({'name': cells[0], 'text': cells[-1]})
        elif len(cells) == 1:
            # Section header row: flush the previous section (if any)
            # and start collecting entries for the new one.
            if section_name:
                result['sections'].append(
                    {'name': section_name, 'entries': entries})
            entries = []
            section_name = cells[0].lower()
    if section_name:  # flush the trailing section
        result['sections'].append({'name': section_name, 'entries': entries})

    with open(fname.replace('XML', '') + '.json', 'w') as f:
        json.dump(result, f, indent=2)


def get_schema(driver, link):
    """Download and extract the XML/XSD zip bundle linked from *link*.

    Navigates *driver* to the page, resolves the "Descarga XML/XSD" link's
    href, downloads it with requests, saves the zip in the current
    directory and extracts it there.  Prints a status message either way.
    """
    driver.get(link)
    sec = driver.find_element(By.LINK_TEXT, 'Descarga XML/XSD')
    zip_url = sec.get_attribute('href')
    response = requests.get(zip_url)
    if response.status_code == 200:
        fname = zip_url.split('/')[-1]
        with open(fname, 'wb') as fh:  # renamed: don't shadow builtin `file`
            fh.write(response.content)
        with ZipFile(fname, 'r') as zf:
            zf.extractall()
        print('File downloaded successfully!')
    else:
        print("Failed to download the file.")


# Runs at import time, as the original script did.
ll = 'https://siatinfo.impuestos.gob.bo/index.php/facturacion-en-linea/archivos-xml-xsd-de-facturas-electronicas/factura-comercial-de-exportacion-de-servicios'
get_doc(ll)

# Disabled crawl mode: walks every invoice-type page linked from the
# "sistema-facturacion" menu and runs get_doc on each.  Kept for reference.
# (The original held this inside a triple-quoted string; the f-string in it
# was split across a raw newline and the bare `except:` swallowed all
# errors — narrowed to Exception below.)
#
# with webdriver.Chrome() as driver:
#     driver.get('https://siatinfo.impuestos.gob.bo/index.php/sistema-facturacion')
#     sec = driver.find_element(By.XPATH, '//*[@id="item-195"]/ul')
#     links = [aa.get_attribute('href')
#              for aa in sec.find_elements(By.TAG_NAME, "a")]
#     for link in links:
#         if not link.startswith('http'):
#             break
#         try:
#             # get_schema(driver, link)
#             get_doc(link)
#         except Exception:
#             print(f'failed {link}')