# pfs-invoices/utils/scrapper/scrapper.py

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import requests
from zipfile import ZipFile
from bs4 import BeautifulSoup
import json


def get_doc(link, timeout=30):
    """Scrape the first table of a page into a JSON file of named sections.

    Downloads ``link`` and uses the ``href`` of the 'Descarga XML/XSD' anchor
    to derive the output file name.  It then walks the rows of the first
    ``<table>`` on the page, skipping the first row (presumably the column
    header -- confirm against the page):

    * a row with exactly one cell starts a new section named after that
      cell's text, lower-cased;
    * a row with several cells and a non-empty first cell becomes an entry
      ``{'name': <first cell>, 'text': <last cell>}``.

    Entries found before the first section row are not discarded: they end
    up in the first section.  If the table has no section row at all, the
    file is written with an empty ``sections`` list.

    The result is written to the current working directory as
    ``<name>.json``, where ``<name>`` is the last path segment of the anchor
    href up to its first dot, with every 'XML' substring removed.

    Args:
        link: URL of the page to scrape.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        IndexError: If the page has no 'Descarga XML/XSD' link or no table.
    """
    # Without a timeout a stalled server would block the scraper forever.
    resp = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, 'html.parser')

    anchors = soup.find_all('a', href=True, string='Descarga XML/XSD')
    if not anchors:
        # IndexError is what indexing the empty result used to raise; keep
        # the type for existing callers but say what is actually missing.
        raise IndexError(f"no 'Descarga XML/XSD' link found on {link}")
    fname = anchors[0]['href'].split('/')[-1].split('.')[0]

    tables = soup.find_all('table')
    if not tables:
        raise IndexError(f'no table found on {link}')

    sections = []
    entries = []
    current = ''

    for row in tables[0].find_all('tr')[1:]:
        cells = [cell.text.strip() for cell in row.find_all('td')]

        if len(cells) == 1:
            # A single-cell row is a section heading: close the section in
            # progress (if one is open) and start collecting for the new one.
            if current:
                sections.append({'name': current, 'entries': entries})
                entries = []
            current = cells[0].lower()
        elif len(cells) > 1 and cells[0]:
            entries.append({'name': cells[0], 'text': cells[-1]})

    # Flush the last open section; nothing follows it to trigger the flush.
    if current:
        sections.append({'name': current, 'entries': entries})

    with open(fname.replace('XML', '') + '.json', 'w', encoding='utf-8') as f:
        json.dump({'sections': sections}, f, indent=2)


def get_schema(driver, link, timeout=30):
    """Download and unpack the archive behind a page's 'Descarga XML/XSD' link.

    Navigates ``driver`` to ``link``, reads the ``href`` of the link whose
    text is 'Descarga XML/XSD' and downloads it with ``requests``.  On an
    HTTP 200 response the payload is saved in the current working directory
    under the last path segment of the URL, extracted in place as a zip
    archive, and a success message is printed.  Any other status only prints
    a failure message; nothing is written.

    Args:
        driver: Selenium WebDriver used to load the page and locate the link.
        link: URL of the page that contains the download link.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``.

    Raises:
        requests.RequestException: If the download request fails or times
            out.
    """
    driver.get(link)
    anchor = driver.find_element(By.LINK_TEXT, 'Descarga XML/XSD')
    archive_url = anchor.get_attribute('href')

    # Without a timeout a stalled server would block the scraper forever.
    response = requests.get(archive_url, timeout=timeout)
    if response.status_code != 200:
        print("Failed to download the file.")
        return

    fname = archive_url.split('/')[-1]
    # The archive is kept on disk next to its extracted contents.
    with open(fname, 'wb') as file:
        file.write(response.content)
    with ZipFile(fname, 'r') as archive:
        archive.extractall()
    print('File downloaded successfully!')


# Page that is scraped when this module is loaded.
ll = 'https://siatinfo.impuestos.gob.bo/index.php/facturacion-en-linea/archivos-xml-xsd-de-facturas-electronicas/factura-comercial-de-exportacion-de-servicios'

# Runs at module level: executing or importing this file performs the HTTP
# request and writes the resulting JSON file as a side effect.
get_doc(ll)

# Disabled crawler, kept as an inert string literal (it is never executed).
# If enabled it would open Chrome, collect the href of every <a> under the
# element matched by '//*[@id="item-195"]/ul' and call get_doc on each one,
# stopping at the first href that does not start with 'http'.
'''
with webdriver.Chrome() as driver:
    driver.get('https://siatinfo.impuestos.gob.bo/index.php/sistema-facturacion')
    sec = driver.find_element(By.XPATH, '//*[@id="item-195"]/ul')
    links = [aa.get_attribute('href') for aa in sec.find_elements(By.TAG_NAME, "a")]

    for link in links:
        if not link.startswith('http'):
            break

        try:
            # get_schema(driver, link)
            get_doc(link)
        except:
            print(f'failed {link}')
'''
init pfs-invoices 2024-04-06 15:21:22 +00:00			`from selenium import webdriver`
			`from selenium.webdriver.common.keys import Keys`
			`from selenium.webdriver.common.by import By`
			`import requests`
			`from zipfile import ZipFile`
			`from bs4 import BeautifulSoup`
			`import json`


			`def get_doc(link):`
			`resp = requests.get(link)`
			`soup = BeautifulSoup(resp.content, 'html.parser')`

			`t = soup.findAll('a', href=True, string='Descarga XML/XSD')[0]`
			`fname = t['href'].split('/')[-1].split('.')[0]`
			`table = soup.find_all('table')`

			`partial = []`
			`result = {"sections": []}`
			`curr = ''`

			`for r in table[0].find_all('tr')[1:]:`
			`props = [c.text.strip() for c in r.find_all('td')]`
			`filtered = [p for p in props]`

			`if len(filtered) > 1 and filtered[0]:`
			`partial.append({'name': filtered[0], 'text': filtered[-1]})`

			`if len(filtered) == 1:`
			`if curr:`
			`result['sections'].append({'name': curr, 'entries': partial[:]})`
			`partial = []`
			`curr = filtered[0].lower()`

			`if curr:`
			`result['sections'].append({'name': curr, 'entries': partial[:]})`

			`with open(fname.replace('XML', '') + '.json', 'w') as f:`
			`json.dump(result, f, indent=2)`


			`def get_schema(driver, link):`
			`driver.get(link)`
			`sec = driver.find_element(By.LINK_TEXT, 'Descarga XML/XSD')`
			`zipfile = sec.get_attribute('href')`
			`response = requests.get(zipfile)`
			`if response.status_code == 200:`
			`fname = zipfile.split('/')[-1]`
			`with open(fname, 'wb') as file:`
			`file.write(response.content)`
			`with ZipFile(fname, 'r') as f:`
			`f.extractall()`
			`print('File downloaded successfully!')`
			`else:`
			`print("Failed to download the file.")`


			`ll = 'https://siatinfo.impuestos.gob.bo/index.php/facturacion-en-linea/archivos-xml-xsd-de-facturas-electronicas/factura-comercial-de-exportacion-de-servicios'`

			`get_doc(ll)`

			`'''`
			`with webdriver.Chrome() as driver:`
			`driver.get('https://siatinfo.impuestos.gob.bo/index.php/sistema-facturacion')`
			`sec = driver.find_element(By.XPATH, '//*[@id="item-195"]/ul')`
			`links = [aa.get_attribute('href') for aa in sec.find_elements(By.TAG_NAME, "a")]`

			`for link in links:`
			`if not link.startswith('http'):`
			`break`

			`try:`
			`# get_schema(driver, link)`
			`get_doc(link)`
			`except:`
			`print(f'failed {link}')`
			`'''`