# -*- coding: utf-8 -*-
from selenium import webdriver
|
|
from selenium.webdriver.common.keys import Keys
|
|
from selenium.webdriver.common.by import By
|
|
import requests
|
|
from zipfile import ZipFile
|
|
from bs4 import BeautifulSoup
|
|
import json
|
|
|
|
|
|
def get_doc(link):
    """Scrape a SIAT invoice-documentation page and dump its spec table to JSON.

    Fetches *link*, uses the 'Descarga XML/XSD' anchor's file name to derive
    the output file name, then walks the first HTML table on the page:
    rows with a single cell start a new section; rows with several cells
    become ``{'name', 'text'}`` entries of the current section. The result is
    written as ``<name>.json`` in the working directory.

    Raises IndexError if the page has no 'Descarga XML/XSD' anchor or no table.
    """
    resp = requests.get(link, timeout=30)  # timeout: don't hang on a dead host
    soup = BeautifulSoup(resp.content, 'html.parser')

    # The download anchor's file name (e.g. '...XML.zip') names the JSON output.
    anchor = soup.find_all('a', href=True, string='Descarga XML/XSD')[0]
    fname = anchor['href'].split('/')[-1].split('.')[0]

    tables = soup.find_all('table')

    partial = []              # entries collected for the section in progress
    result = {"sections": []}
    curr = ''                 # current section name ('' until first header row)

    for row in tables[0].find_all('tr')[1:]:  # skip the table's header row
        cells = [c.text.strip() for c in row.find_all('td')]

        # Multi-cell row: a field entry (first cell = name, last = description).
        if len(cells) > 1 and cells[0]:
            partial.append({'name': cells[0], 'text': cells[-1]})

        # Single-cell row: a section header; flush the previous section first.
        # NOTE(review): entries seen before the first header are deliberately
        # kept and end up attributed to the first section — confirm intended.
        if len(cells) == 1:
            if curr:
                result['sections'].append({'name': curr, 'entries': partial[:]})
                partial = []
            curr = cells[0].lower()

    # Flush the trailing section.
    if curr:
        result['sections'].append({'name': curr, 'entries': partial[:]})

    with open(fname.replace('XML', '') + '.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)
|
|
|
|
|
|
def get_schema(driver, link):
    """Download and extract the XML/XSD zip linked from *link*.

    Opens *link* with the Selenium *driver*, resolves the 'Descarga XML/XSD'
    anchor to the zip URL, downloads it with requests, saves it under its
    remote file name, and extracts it into the working directory. Prints a
    success/failure message instead of raising on a non-200 response.
    """
    driver.get(link)
    anchor = driver.find_element(By.LINK_TEXT, 'Descarga XML/XSD')
    zip_url = anchor.get_attribute('href')  # renamed: 'zipfile' shadows the stdlib module

    response = requests.get(zip_url, timeout=30)  # timeout: don't hang on a dead host
    if response.status_code == 200:
        fname = zip_url.split('/')[-1]
        with open(fname, 'wb') as out:  # renamed from 'file' (shadowed py2 builtin)
            out.write(response.content)
        # SECURITY(review): extractall() on a downloaded archive is vulnerable
        # to zip-slip path traversal if the server is compromised — acceptable
        # here only because the source is a fixed government site.
        with ZipFile(fname, 'r') as archive:
            archive.extractall()
        print('File downloaded successfully!')
    else:
        print("Failed to download the file.")
|
|
|
|
|
|
# Landing page for the 'factura comercial de exportación de servicios'
# document type on Bolivia's SIAT tax-authority site.
ll = 'https://siatinfo.impuestos.gob.bo/index.php/facturacion-en-linea/archivos-xml-xsd-de-facturas-electronicas/factura-comercial-de-exportacion-de-servicios'

# Scrape that single page into a JSON spec file (runs at import time).
get_doc(ll)

# NOTE(review): dead code parked in a string literal — a Selenium crawl that
# walked every invoice-type link in the site menu and ran get_doc (and,
# commented out, get_schema) on each. Delete or restore deliberately; the
# bare `except:` inside it should become `except Exception:` if revived.
'''
with webdriver.Chrome() as driver:
    driver.get('https://siatinfo.impuestos.gob.bo/index.php/sistema-facturacion')
    sec = driver.find_element(By.XPATH, '//*[@id="item-195"]/ul')
    links = [aa.get_attribute('href') for aa in sec.find_elements(By.TAG_NAME, "a")]

    for link in links:
        if not link.startswith('http'):
            break

        try:
            # get_schema(driver, link)
            get_doc(link)
        except:
            print(f'failed {link}')
'''