# -*- coding: utf-8 -*-
from selenium import webdriver
|
|
from selenium.webdriver.common.keys import Keys
|
|
from selenium.webdriver.common.by import By
|
|
import requests
|
|
from zipfile import ZipFile
|
|
from bs4 import BeautifulSoup
|
|
import json
|
|
|
|
|
|
def get_doc(link):
    """Scrape a SIAT invoice-documentation page and dump its spec table to JSON.

    Fetches *link*, uses the 'Descarga XML/XSD' anchor's file name to derive
    the output file name, then walks the first HTML table on the page:
    rows with a single cell start a new section; rows with several cells
    become ``{'name', 'text'}`` entries of the current section. The result is
    written as ``<name>.json`` in the working directory.

    Raises IndexError if the page has no 'Descarga XML/XSD' anchor or no table.
    """
    resp = requests.get(link, timeout=30)  # timeout: don't hang on a dead host
    soup = BeautifulSoup(resp.content, 'html.parser')

    # The download anchor's file name (e.g. '...XML.zip') names the JSON output.
    anchor = soup.find_all('a', href=True, string='Descarga XML/XSD')[0]
    fname = anchor['href'].split('/')[-1].split('.')[0]

    tables = soup.find_all('table')

    partial = []              # entries collected for the section in progress
    result = {"sections": []}
    curr = ''                 # current section name ('' until first header row)

    for row in tables[0].find_all('tr')[1:]:  # skip the table's header row
        cells = [c.text.strip() for c in row.find_all('td')]

        # Multi-cell row: a field entry (first cell = name, last = description).
        if len(cells) > 1 and cells[0]:
            partial.append({'name': cells[0], 'text': cells[-1]})

        # Single-cell row: a section header; flush the previous section first.
        # NOTE(review): entries seen before the first header are deliberately
        # kept and end up attributed to the first section — confirm intended.
        if len(cells) == 1:
            if curr:
                result['sections'].append({'name': curr, 'entries': partial[:]})
                partial = []
            curr = cells[0].lower()

    # Flush the trailing section.
    if curr:
        result['sections'].append({'name': curr, 'entries': partial[:]})

    with open(fname.replace('XML', '') + '.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)
|
|
|
|
|
|
def get_schema(driver, link):
    """Download and extract the XML/XSD zip linked from *link*.

    Opens *link* with the Selenium *driver*, resolves the 'Descarga XML/XSD'
    anchor to the zip URL, downloads it with requests, saves it under its
    remote file name, and extracts it into the working directory. Prints a
    success/failure message instead of raising on a non-200 response.
    """
    driver.get(link)
    anchor = driver.find_element(By.LINK_TEXT, 'Descarga XML/XSD')
    zip_url = anchor.get_attribute('href')  # renamed: 'zipfile' shadows the stdlib module

    response = requests.get(zip_url, timeout=30)  # timeout: don't hang on a dead host
    if response.status_code == 200:
        fname = zip_url.split('/')[-1]
        with open(fname, 'wb') as out:  # renamed from 'file' (shadowed py2 builtin)
            out.write(response.content)
        # SECURITY(review): extractall() on a downloaded archive is vulnerable
        # to zip-slip path traversal if the server is compromised — acceptable
        # here only because the source is a fixed government site.
        with ZipFile(fname, 'r') as archive:
            archive.extractall()
        print('File downloaded successfully!')
    else:
        print("Failed to download the file.")
|
|
|
|
|
|
# Landing page for the 'factura comercial de exportación de servicios'
# document type on Bolivia's SIAT tax-authority site.
ll = 'https://siatinfo.impuestos.gob.bo/index.php/facturacion-en-linea/archivos-xml-xsd-de-facturas-electronicas/factura-comercial-de-exportacion-de-servicios'

# Scrape that single page into a JSON spec file (runs at import time).
get_doc(ll)

# NOTE(review): dead code parked in a string literal — a Selenium crawl that
# walked every invoice-type link in the site menu and ran get_doc (and,
# commented out, get_schema) on each. Delete or restore deliberately; the
# bare `except:` inside it should become `except Exception:` if revived.
'''
with webdriver.Chrome() as driver:
    driver.get('https://siatinfo.impuestos.gob.bo/index.php/sistema-facturacion')
    sec = driver.find_element(By.XPATH, '//*[@id="item-195"]/ul')
    links = [aa.get_attribute('href') for aa in sec.find_elements(By.TAG_NAME, "a")]

    for link in links:
        if not link.startswith('http'):
            break

        try:
            # get_schema(driver, link)
            get_doc(link)
        except:
            print(f'failed {link}')
'''