# pfs-invoices/utils/scrapper/scrapper.py
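"""Scraper for the SIAT e-invoicing documentation site
(siatinfo.impuestos.gob.bo).

get_doc pulls the field-description table of a documentation page into a
JSON file; get_schema downloads and unpacks the XML/XSD zip archive linked
as "Descarga XML/XSD" on the same page.
"""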

import json
from zipfile import ZipFile

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By


def get_doc(link):
    resp = requests.get(link)
    soup = BeautifulSoup(resp.content, 'html.parser')
    # Name the output file after the archive behind the "Descarga XML/XSD" link.
    anchor = soup.find_all('a', href=True, string='Descarga XML/XSD')[0]
    fname = anchor['href'].split('/')[-1].split('.')[0]
    tables = soup.find_all('table')
    partial = []
    result = {"sections": []}
    curr = ''
    # Walk the first table, skipping its header row. A row with a single cell
    # starts a new section; a multi-cell row describes one field (name in the
    # first cell, description in the last).
    for r in tables[0].find_all('tr')[1:]:
        cells = [c.text.strip() for c in r.find_all('td')]
        if len(cells) > 1 and cells[0]:
            partial.append({'name': cells[0], 'text': cells[-1]})
        if len(cells) == 1:
            if curr:
                result['sections'].append({'name': curr, 'entries': partial[:]})
                partial = []
            curr = cells[0].lower()
    # Flush the section still being collected when the table ends.
    if curr:
        result['sections'].append({'name': curr, 'entries': partial[:]})
    with open(fname.replace('XML', '') + '.json', 'w') as f:
        json.dump(result, f, indent=2)
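
# Shape of the JSON written by get_doc (a sketch inferred from the code
# above, not from a captured output file):
#
#   {"sections": [{"name": "<section title, lowercased>",
#                  "entries": [{"name": "<field>", "text": "<description>"}]}]}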


def get_schema(driver, link):
    driver.get(link)
    sec = driver.find_element(By.LINK_TEXT, 'Descarga XML/XSD')
    zip_url = sec.get_attribute('href')
    response = requests.get(zip_url)
    if response.status_code == 200:
        # Save the archive under its own name and unpack it in place.
        fname = zip_url.split('/')[-1]
        with open(fname, 'wb') as file:
            file.write(response.content)
        with ZipFile(fname, 'r') as f:
            f.extractall()
        print('File downloaded successfully!')
    else:
        print('Failed to download the file.')


if __name__ == '__main__':
    link = 'https://siatinfo.impuestos.gob.bo/index.php/facturacion-en-linea/archivos-xml-xsd-de-facturas-electronicas/factura-comercial-de-exportacion-de-servicios'
    get_doc(link)

# Disabled exploratory pass: walk every page linked from the site's
# facturación menu and run get_doc (or get_schema) on each one.
'''
with webdriver.Chrome() as driver:
    driver.get('https://siatinfo.impuestos.gob.bo/index.php/sistema-facturacion')
    sec = driver.find_element(By.XPATH, '//*[@id="item-195"]/ul')
    links = [aa.get_attribute('href') for aa in sec.find_elements(By.TAG_NAME, "a")]
    for link in links:
        # Stop at the first entry that is not an absolute HTTP link.
        if not link.startswith('http'):
            break
        try:
            # get_schema(driver, link)
            get_doc(link)
        except Exception:
            print(f'failed {link}')
'''