import requests
import datetime
import sqlite3
import json
import time
import glob

# dados e visualização
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# visualização interativa
import altair as alt
alt.renderers.enable("html")

# estatística
import pingouin as pg
import statsmodels.api as sm

# machine learning e NLP
import hdbscan
import umap.umap_ as umap

# mapa
import folium

# cores do lablivre
palette = ["#412355", "#F2701C", "#18CEE6"]

# o script de coleta de dados foi executado separadamente


def get_data(page: int) -> dict:
    url = "https://api.obrasgov.gestao.gov.br/obrasgov/api/projeto-investimento"
    params = {"uf": "DF", "pagina": page, "tamanhoDaPagina": 100}
    headers = {"accept": "*/*"}

    response = requests.get(url, params=params, headers=headers)

    if response.status_code != 200:
        raise Exception(f"Failed to get data: {response.status_code}")

    return response.json()


def main():
    # deselegante mas resolve imediatamente
    for page in range(100):
        response = get_data(page)
        print(f"Page {page} processed")

        # salvar dados brutos para não depender da api
        with open(f"data/data-{page}.json", "w", encoding="utf-8") as f:
            json.dump(response["content"], f, indent=4)
        print(f"Data saved to data/data-{page}.json")

        time.sleep(1)

        page += 1


# if __name__ == "__main__":
#     main()

def load_json_files() -> list[dict]:
    json_files = glob.glob("data/data-*.json")
    all_records = []

    for file in json_files:
        with open(file, "r", encoding="utf-8") as f:
            records = json.load(f)
            all_records.extend(records)

    return all_records

df = pd.DataFrame(load_json_files())
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 834 entries, 0 to 833
Data columns (total 31 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   idUnico                             834 non-null    object
 1   nome                                834 non-null    object
 2   cep                                 400 non-null    object
 3   endereco                            430 non-null    object
 4   descricao                           834 non-null    object
 5   funcaoSocial                        834 non-null    object
 6   metaGlobal                          834 non-null    object
 7   dataInicialPrevista                 832 non-null    object
 8   dataFinalPrevista                   832 non-null    object
 9   dataInicialEfetiva                  23 non-null     object
 10  dataFinalEfetiva                    7 non-null      object
 11  dataCadastro                        834 non-null    object
 12  especie                             830 non-null    object
 13  natureza                            834 non-null    object
 14  naturezaOutras                      211 non-null    object
 15  situacao                            834 non-null    object
 16  descPlanoNacionalPoliticaVinculado  287 non-null    object
 17  uf                                  834 non-null    object
 18  qdtEmpregosGerados                  164 non-null    object
 19  descPopulacaoBeneficiada            174 non-null    object
 20  populacaoBeneficiada                169 non-null    object
 21  observacoesPertinentes              129 non-null    object
 22  isModeladaPorBim                    591 non-null    object
 23  dataSituacao                        834 non-null    object
 24  tomadores                           834 non-null    object
 25  executores                          834 non-null    object
 26  repassadores                        834 non-null    object
 27  eixos                               834 non-null    object
 28  tipos                               834 non-null    object
 29  subTipos                            834 non-null    object
 30  fontesDeRecurso                     834 non-null    object
dtypes: object(31)
memory usage: 2.3 MB

pd.set_option("display.max_columns", None)
df.head()

# colunas relativa a dados aninhados, represetando dados com relação many to many

nested_cols = [
    "tomadores",
    "executores",
    "repassadores",
    "eixos",
    "tipos",
    "subTipos",
    "fontesDeRecurso",
]

from analyse_duplicates import check_duplicates, analyze_false_duplicates

# existem duplicatas que só diferem no conteúdo das listas.
# não quero te entediar com esse código gerado por IA então movi para outro arquivo
# Para mais detalhes, consultar o arquivo `apendices/duplicate_records_report.txt`.

check_duplicates(df)
analyze_false_duplicates(df, "apendices/duplicate_records_report")

Columns with lists: ['tomadores', 'executores', 'repassadores', 'eixos', 'tipos', 'subTipos', 'fontesDeRecurso']
Number of duplicate rows (not considering list columns): 122
Number of duplicate rows: 87
Number of 'false duplicate' rows: 70
Unique idUnico values in false duplicates: 35
Report written to duplicate_records_report.txt

df.duplicated(subset=["idUnico"]).sum()
df.drop_duplicates(subset=["idUnico"], inplace=True)

# checar apenas colunas que não são aninhadas
object_cols = df.drop(columns=nested_cols, axis=1)

info_df = pd.DataFrame(
    {
        "NaN Values": object_cols.isna().sum(),
        "% NaN": (object_cols.isna().sum() / len(object_cols) * 100).round(1),
    }
)
info_df[info_df["% NaN"] > 1].sort_values(by="% NaN", ascending=False)

hoje = datetime.datetime.now()
df["dataFinalPrevista"] = pd.to_datetime(df["dataFinalPrevista"])

# Calcular quantos projetos estão atrasados
atrasados = df[df["dataFinalPrevista"] < hoje]
print(
    f"{len(atrasados)} projetos ({(len(atrasados)/len(df)*100):.1f}%) estão com data final prevista no passado"
)

516 projetos (72.5%) estão com data final prevista no passado

# erro de encoding. Texto em UTF-8, foi lido como Latin-1 (ISO-8859-1)
# toddos tem uma rodovia como endereço, indicando que o erro é relacionado à fonte dos dados.

enconding_issue = df[df["descricao"].str.contains("Ã§Ã")]
print(f"Pelo menos {enconding_issue.shape[0]} projetos com erro de encoding")
print("exemplo:", df[df["descricao"].str.contains("Ã§Ã")].head(1)["descricao"])

Pelo menos 18 projetos com erro de encoding
exemplo: 63    prestaÃ§Ã£o dos serviÃ§os de gerenciamento dos...
Name: descricao, dtype: object

def fix_encoding(item: any) -> any:
    if isinstance(item, str):
        try:
            return item.encode("latin-1").decode("utf-8")
        except:
            return item
    return item


for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].apply(fix_encoding)

# há um sujeito chamado Ronald Alves Vieira e ele está testando em produção
test_df = df[df["nome"].str.contains("Ronald", case=False, na=False)]
test_df.head()

# é possível que hajam mais casos que não encontrei
# é possível que essa limpeza remova dados reais

antes = df.shape[0]

df = df[~df["nome"].str.contains("Ronald", case=False, na=False)]
df = df[~df["nome"].str.contains("Teste", case=False, na=False)]

depois = df.shape[0]

print(
    f"Removidos {antes - depois} projetos por serem registros falsos (efeitos colaterais de testes de integração)"
)

Removidos 22 projetos por serem registros falsos (efeitos colaterais de testes de integração)

# remover ceps preenchidos com 1 ou espaço.
df["cep"] = df["cep"].str.replace(r"^1$", "", regex=True)
df["cep"] = df["cep"].str.replace(r"\s+", "", regex=True).replace("", None)

# normalizar ceps
df["cep"] = df["cep"].str.replace("-", "")
df["cep"] = df["cep"].str.replace(".", "")

# remover ceps com menos de 8 dígitos
df["cep"] = df["cep"].apply(lambda x: None if pd.isna(x) or len(str(x)) < 8 else x)

df["cep"].nunique()

88

# remover enderecos preenchidos com 1 ou espaço.
df["endereco"] = df["endereco"].str.strip()
df["endereco"] = df["endereco"].str.replace(r"^1$", "", regex=True)
df["endereco"] = df["endereco"].str.replace(r"^\s+$", "", regex=True).replace("", None)
df["endereco"].nunique()

233

df["isModeladaPorBim"] = df["isModeladaPorBim"].astype("boolean")

# variaveis categoricas
df["natureza"] = df["natureza"].astype("category")
df["situacao"] = df["situacao"].astype("category")
df["especie"] = df["especie"].astype("category")
df["uf"] = df["uf"].astype("category")

# variaveis de data
# verifiquei manualmente os registros para erros de formatação mas não encontrei problemas

date_cols = [
    "dataInicialPrevista",
    "dataFinalPrevista",
    "dataInicialEfetiva",
    "dataFinalEfetiva",
    "dataCadastro",
    "dataSituacao",
]

for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

# alguns registros tem strings em vez de ints. esses registros também têm outros problemas; veja abaixo.

# converte para float em vez de int para permitir valores nulos, e transforma strings em nulos
df["qdtEmpregosGerados"] = pd.to_numeric(df["qdtEmpregosGerados"], errors="coerce")
df["populacaoBeneficiada"] = pd.to_numeric(df["populacaoBeneficiada"], errors="coerce")

def normalize_nested_column(
    df: pd.DataFrame, id_col: str, nested_col: str
) -> pd.DataFrame:
    def convert_dict_values_to_str(x):
        # pd.json_normalize(df['tomadores'].explode()) converte o id para float, o que pode causar problemas devido à imprecisão de ponto flutuante mesmo se convertido de volta para int/str. por isso, preciso converter os valores para strings primeiro, o que requer um código um pouco confuso:

        if not isinstance(x, list):
            return x
        return [{key: str(value) for key, value in item.items()} for item in x]

    df[nested_col] = df[nested_col].apply(convert_dict_values_to_str)
    exploded = df[[id_col, nested_col]].explode(nested_col)
    exploded = exploded.dropna(subset=[nested_col])
    normalized = pd.json_normalize(exploded[nested_col])
    normalized[id_col] = exploded[id_col].values
    return normalized


def create_entity_table_from_junction_table(
    old_df: pd.DataFrame, prev_index: str, actual_index: str
) -> pd.DataFrame:
    new_df = old_df.dropna(subset=[prev_index])
    new_df.drop_duplicates(subset=[actual_index], inplace=True)
    new_df.set_index(actual_index, inplace=True)
    new_df.drop(prev_index, axis=1, inplace=True)
    return new_df

# reuni essas tabelas em uma só, pois são todas referentes a instituicoes e usam o mesmo sistema de códigos.

projeto_tomadores_df = normalize_nested_column(df, "idUnico", "tomadores")
projeto_executores_df = normalize_nested_column(df, "idUnico", "executores")
projeto_repassadores_df = normalize_nested_column(df, "idUnico", "repassadores")

tomadores_df = create_entity_table_from_junction_table(
    projeto_tomadores_df, "idUnico", "codigo"
)
executores_df = create_entity_table_from_junction_table(
    projeto_executores_df, "idUnico", "codigo"
)
repassadores_df = create_entity_table_from_junction_table(
    projeto_repassadores_df, "idUnico", "codigo"
)

instituicoes_df = pd.concat([tomadores_df, executores_df, repassadores_df])

# deduplicate indices
instituicoes_df = instituicoes_df.loc[~instituicoes_df.index.duplicated(keep="first")]

instituicoes_df.head(1)

projeto_eixos_df = normalize_nested_column(df, "idUnico", "eixos")

eixos_df = create_entity_table_from_junction_table(projeto_eixos_df, "idUnico", "id")

eixos_df.head(1)

projeto_tipos_df = normalize_nested_column(df, "idUnico", "tipos")

tipos_df = create_entity_table_from_junction_table(projeto_tipos_df, "idUnico", "id")

tipos_df.head(1)

projeto_subtipos_df = normalize_nested_column(df, "idUnico", "subTipos")

subtipos_df = create_entity_table_from_junction_table(
    projeto_subtipos_df, "idUnico", "id"
)

subtipos_df.head(1)

# nesse caso é one to many e nao many to many, entao nao precisa da tabela intermediaria

fontes_de_recurso_df = normalize_nested_column(
    df, "idUnico", "fontesDeRecurso"
).reset_index(drop=True)

# nesse caso estava certo o float
fontes_de_recurso_df["valorInvestimentoPrevisto"] = pd.to_numeric(
    fontes_de_recurso_df["valorInvestimentoPrevisto"], errors="coerce"
)

fontes_de_recurso_df.head(1)

df.drop(columns=nested_cols, inplace=True)

text_cols = [
    "nome",
    "endereco",
    "descricao",
    "funcaoSocial",
    "metaGlobal",
    "especie",
    "natureza",
    "naturezaOutras",
    "situacao",
    "descPlanoNacionalPoliticaVinculado",
    "descPopulacaoBeneficiada",
    "observacoesPertinentes",
]


def make_text_total(row):
    text_total = []
    for x in row:
        if pd.notna(x):
            text_total.append(str(x))
    return "\n\n".join(text_total)


df["textoTotal"] = df[text_cols].apply(make_text_total, axis=1)

valor_por_id = (
    fontes_de_recurso_df.groupby("idUnico")["valorInvestimentoPrevisto"]
    .sum()
    .reset_index()
)

valor_por_id.rename(
    columns={"valorInvestimentoPrevisto": "investimentoTotal"}, inplace=True
)

df = df.merge(valor_por_id, on="idUnico", how="left", validate="one_to_one")

bins = [
    0,
    1,
    10,
    100,
    1_000,
    10_000,
    100_000,
    1_000_000,
    10_000_000,
    100_000_000,
    1_000_000_000,
    np.inf,
]
labels_faixa = [
    "<1",
    "1-10",
    "10-100",
    "100-1k",
    "1k-10k",
    "10k-100k",
    "100k-1M",
    "1M-10M",
    "10M-100M",
    "100M-1B",
    ">1B",
]

df["investimentoFaixa"] = pd.cut(df["investimentoTotal"], bins=bins, labels=labels_faixa, ordered=True)

def plot_investment_ranges(df, palette):
    faixa_counts = df["investimentoFaixa"].value_counts().reset_index()
    faixa_counts.columns = ["investimentoFaixa", "count"]

    plt.figure(figsize=(10, 4))
    ax = sns.barplot(
        data=faixa_counts,
        x="investimentoFaixa", 
        y="count",
        hue="investimentoFaixa",
        palette=[palette[0] for _ in range(len(faixa_counts))],
        order=labels_faixa
    )

    # custom palette creates multiple containers
    for container in ax.containers:
        ax.bar_label(container)  # type: ignore

    plt.xlabel("Faixa de Investimento (R$)")
    plt.ylabel("Projetos")
    plt.title("Projetos por Faixa de Investimento")
    plt.tight_layout()
    plt.show()


plot_investment_ranges(df, palette)

df["isInvestimentoSimbolico"] = (df["investimentoTotal"] <= 1.0).astype('boolean')
df['isInvestimentoSimbolico'].value_counts()

isInvestimentoSimbolico
False    571
True     119
Name: count, dtype: Int64

df["duracaoPrevistaDias"] = df["dataFinalPrevista"] - df["dataInicialPrevista"]
df["duracaoPrevistaDias"] = df["duracaoPrevistaDias"].dt.days
df["duracaoPrevistaDias"] = df["duracaoPrevistaDias"].astype(float)
df["duracaoPrevistaDias"].describe()

count     688.000000
mean      750.819767
std       688.251682
min         0.000000
25%       291.750000
50%       469.000000
75%      1090.250000
max      4827.000000
Name: duracaoPrevistaDias, dtype: float64

total = len(df)
missing = df["cep"].isna().sum()
print(f"Missing CEP values: {missing} out of {total} ({missing/total:.1%})")
ceps = df["cep"].dropna().unique()

Missing CEP values: 475 out of 690 (68.8%)

from geocode_ceps import cep_to_coords_viacep, cep_to_coords_ipedf

# para detalhes de obtenção dos valores, o arquivo `geocode_ceps.py`

# cep_to_coords_viacep(ceps.tolist(), "data/cep_coords_viacep.json")
# cep_to_coords_ipedf(ceps.tolist(), "data/cep_coords_ipedf.json")

with open("data/cep_coords_viacep.json") as f:
    data = json.load(f)
df_viacep = pd.DataFrame.from_dict(data, orient="index")

df_viacep.columns = ["latitude", "longitude"]

print(
    f"Null records: {df_viacep['latitude'].isna().sum()} out of {len(df_viacep)} ({df_viacep['latitude'].isna().sum()/len(df_viacep):.1%})"
)

with open("data/cep_coords_ipedf.json") as f:
    data = json.load(f)
df_ipedf = pd.DataFrame.from_dict(data, orient="index")

df_ipedf.columns = ["latitude", "longitude"]

print(
    f"Null records: {df_ipedf['latitude'].isna().sum()} out of {len(df_ipedf)} ({df_ipedf['latitude'].isna().sum()/len(df_ipedf):.1%})"
)

df = df.merge(
    df_viacep, left_on="cep", right_index=True, how="left", suffixes=("", "_viacep")
)

df = df.merge(
    df_ipedf, left_on="cep", right_index=True, how="left", suffixes=("", "_ipedf")
)

df = df.rename(
    columns={
        "latitude": "lat_viacep",
        "longitude": "lon_viacep",
        "latitude_ipedf": "lat_ipedf",
        "longitude_ipedf": "lon_ipedf",
    }
)

print(
    f"Records with coordinates from both sources: {df[['lon_viacep', 'lon_ipedf']].notna().all(axis=1).sum()}"
)

# Count records with and without coordinates
has_viacep = df[["lat_viacep", "lon_viacep"]].notna().all(axis=1)
has_ipedf = df[["lat_ipedf", "lon_ipedf"]].notna().all(axis=1)

print(
    f"Records with coordinates from either source: {(has_viacep | has_ipedf).sum()} ({(has_viacep | has_ipedf).sum()/len(df):.1%})"
)

Null records: 55 out of 88 (62.5%)
Null records: 47 out of 88 (53.4%)
Records with coordinates from both sources: 29
Records with coordinates from either source: 129 (18.7%)

DB_PATH = "projetos_DF_obrasgov.db"

with sqlite3.connect(DB_PATH) as con:
    con.execute("PRAGMA foreign_keys = ON")

    con.execute("DROP TABLE IF EXISTS projeto_subtipos")
    con.execute("DROP TABLE IF EXISTS projeto_tipos")
    con.execute("DROP TABLE IF EXISTS projeto_eixos")
    con.execute("DROP TABLE IF EXISTS projeto_repassadores")
    con.execute("DROP TABLE IF EXISTS projeto_executores")
    con.execute("DROP TABLE IF EXISTS projeto_tomadores")
    con.execute("DROP TABLE IF EXISTS fontes_de_recurso")
    con.execute("DROP TABLE IF EXISTS subtipos")
    con.execute("DROP TABLE IF EXISTS tipos")
    con.execute("DROP TABLE IF EXISTS eixos")
    con.execute("DROP TABLE IF EXISTS instituicoes")
    con.execute("DROP TABLE IF EXISTS projetos")

    con.execute(
        """
        CREATE TABLE projetos (
            -- Identification and basic info
            idUnico TEXT PRIMARY KEY NOT NULL CHECK (idUnico LIKE '____.__-__'),
            nome TEXT NOT NULL,
            
            -- Dates (sqlite does not support datetime - use text with format 'YYYY-MM-DD HH:MM:SS')
            dataInicialPrevista TEXT CHECK (dataInicialPrevista IS NULL OR dataInicialPrevista LIKE '____-__-__ __:__:__'),
            dataFinalPrevista TEXT CHECK (dataFinalPrevista IS NULL OR dataFinalPrevista LIKE '____-__-__ __:__:__'),
            dataInicialEfetiva TEXT CHECK (dataInicialEfetiva IS NULL OR dataInicialEfetiva LIKE '____-__-__ __:__:__'),
            dataFinalEfetiva TEXT CHECK (dataFinalEfetiva IS NULL OR dataFinalEfetiva LIKE '____-__-__ __:__:__'),
            dataCadastro TEXT CHECK (dataCadastro IS NULL OR dataCadastro LIKE '____-__-__ __:__:__'),
            dataSituacao TEXT CHECK (dataSituacao IS NULL OR dataSituacao LIKE '____-__-__ __:__:__'),
            
            -- Project details
            cep TEXT CHECK (cep IS NULL OR cep LIKE '[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'),
            endereco TEXT, 
            descricao TEXT,
            funcaoSocial TEXT,
            metaGlobal TEXT,
            naturezaOutras TEXT,
            descPlanoNacionalPoliticaVinculado TEXT,
            descPopulacaoBeneficiada TEXT,
            observacoesPertinentes TEXT,
            textoTotal TEXT,
            qdtEmpregosGerados INTEGER,
            populacaoBeneficiada INTEGER,
            
            -- Categories and bools
            especie TEXT CHECK (especie IN ('Ampliação', 'Construção', 'Fabricação', 'Máquinas e Equipamentos', 'Recuperação', 'Reforma')),
            natureza TEXT CHECK (natureza IN ('Estudo', 'Obra', 'Outros', 'Projeto', 'Projeto de Investimento em Infraestrutura')),
            situacao TEXT CHECK (situacao IN ('Cadastrada', 'Cancelada', 'Concluída', 'Em execução', 'Inacabada', 'Inativada', 'Paralisada')),
            uf TEXT CHECK (uf = 'DF'),
            investimentoFaixa TEXT CHECK (investimentoFaixa IN ('<1', '1-10', '10-100', '100-1k', '1k-10k', '10k-100k', '100k-1M', '1M-10M', '10M-100M', '100M-1B', '>1B')),
            isModeladaPorBim INTEGER CHECK (isModeladaPorBim IN (0,1)),
            isInvestimentoSimbolico INTEGER CHECK (isInvestimentoSimbolico IN (0,1)),
            
            -- Investment info (money - don`t use floats)
            investimentoTotal INTEGER,
            duracaoPrevistaDias INTEGER,
            
            -- Geolocation
            lat_viacep REAL,
            lon_viacep REAL,
            lat_ipedf REAL,
            lon_ipedf REAL
        ) STRICT
    """
    )

    # Create entity tables
    con.execute(
        """
        CREATE TABLE instituicoes (
            codigo TEXT PRIMARY KEY NOT NULL,
            nome TEXT NOT NULL UNIQUE
        ) STRICT
    """
    )

    con.execute(
        """
        CREATE TABLE eixos (
            id TEXT PRIMARY KEY NOT NULL,
            descricao TEXT NOT NULL UNIQUE
        ) STRICT
    """
    )

    con.execute(
        """
        CREATE TABLE tipos (
            id TEXT PRIMARY KEY NOT NULL,
            descricao TEXT NOT NULL UNIQUE
        ) STRICT
    """
    )

    con.execute(
        """
        CREATE TABLE subtipos (
            id TEXT PRIMARY KEY NOT NULL,
            descricao TEXT NOT NULL UNIQUE,
            idTipo TEXT NOT NULL,
            FOREIGN KEY (idTipo) REFERENCES tipos(id)
        ) STRICT
    """
    )

    # valorInvestimentoPrevisto == INT money
    con.execute(
        """
        CREATE TABLE fontes_de_recurso (
            origem TEXT NOT NULL,
            valorInvestimentoPrevisto INTEGER,
            idUnico TEXT NOT NULL,
            FOREIGN KEY (idUnico) REFERENCES projetos(idUnico)
        ) STRICT
    """
    )

    # Create junction tables
    con.execute(
        """
        CREATE TABLE projeto_tomadores (
            idUnico TEXT NOT NULL,
            codigo TEXT NOT NULL,
            PRIMARY KEY (idUnico, codigo),
            FOREIGN KEY (idUnico) REFERENCES projetos(idUnico),
            FOREIGN KEY (codigo) REFERENCES instituicoes(codigo)
        ) STRICT
    """
    )

    con.execute(
        """
        CREATE TABLE projeto_executores (
            idUnico TEXT NOT NULL,
            codigo TEXT NOT NULL,
            PRIMARY KEY (idUnico, codigo),
            FOREIGN KEY (idUnico) REFERENCES projetos(idUnico),
            FOREIGN KEY (codigo) REFERENCES instituicoes(codigo)
        ) STRICT
    """
    )

    con.execute(
        """
        CREATE TABLE projeto_repassadores (
            idUnico TEXT NOT NULL,
            codigo TEXT NOT NULL,
            PRIMARY KEY (idUnico, codigo),
            FOREIGN KEY (idUnico) REFERENCES projetos(idUnico),
            FOREIGN KEY (codigo) REFERENCES instituicoes(codigo)
        ) STRICT
    """
    )

    con.execute(
        """
        CREATE TABLE projeto_eixos (
            idUnico TEXT NOT NULL,
            id TEXT NOT NULL,
            PRIMARY KEY (idUnico, id),
            FOREIGN KEY (idUnico) REFERENCES projetos(idUnico),
            FOREIGN KEY (id) REFERENCES eixos(id)
        ) STRICT
    """
    )

    con.execute(
        """
        CREATE TABLE projeto_tipos (
            idUnico TEXT,
            id TEXT,
            PRIMARY KEY (idUnico, id),
            FOREIGN KEY (idUnico) REFERENCES projetos(idUnico),
            FOREIGN KEY (id) REFERENCES tipos(id)
        ) STRICT
    """
    )

    con.execute(
        """
        CREATE TABLE projeto_subtipos (
            idUnico TEXT NOT NULL,
            id TEXT NOT NULL,
            PRIMARY KEY (idUnico, id),
            FOREIGN KEY (idUnico) REFERENCES projetos(idUnico),
            FOREIGN KEY (id) REFERENCES subtipos(id)
        ) STRICT
    """
    )

def normalize_date_cols(df, cols):
    def to_datetime_str(dt):
        # Returns "YYYY-MM-DD HH:MM:SS" or None
        if pd.isna(dt):
            return None
        ts = pd.to_datetime(dt, errors="coerce")
        return None if pd.isna(ts) else ts.strftime("%Y-%m-%d %H:%M:%S")

    for c in cols:
        if c in df.columns:
            df[c] = df[c].map(to_datetime_str)


def normalize_cep(df, col="cep"):
    if col in df.columns:
        s = df[col].astype(str)
        s = s.str.replace(r"\D", "", regex=True)
        s = s.where(s.str.fullmatch(r"\d{8}"))
        df[col] = s


def normalize_bool_cols(df, cols):
    def to_bool_01(s):
        if pd.isna(s):
            return pd.NA
        if isinstance(s, bool):
            return 1 if s else 0
        return pd.NA

    for c in cols:
        if c in df.columns:
            df[c] = df[c].map(to_bool_01).astype("Int64")


def normalize_money_cents(df, col_names):
    def to_cents(x):
        if pd.isna(x):
            return pd.NA
        try:
            cents = int(round(x * 100))
            return cents
        except Exception:
            return pd.NA

    for c in col_names:
        if c in df.columns:
            df[c] = df[c].map(to_cents).astype("Int64")


def normalize_int_cols(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")


def normalize_float_cols(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")


def restrict_to_set(df, col, allowed_set):
    if col in df.columns:
        df[col] = df[col].where(df[col].isin(allowed_set))

date_cols = [
    "dataInicialPrevista",
    "dataFinalPrevista",
    "dataInicialEfetiva",
    "dataFinalEfetiva",
    "dataCadastro",
    "dataSituacao",
]

allowed_especie = {
    "Ampliação",
    "Construção",
    "Fabricação",
    "Máquinas e Equipamentos",
    "Recuperação",
    "Reforma",
}
allowed_natureza = {
    "Estudo",
    "Obra",
    "Outros",
    "Projeto",
    "Projeto de Investimento em Infraestrutura",
}
allowed_situacao = {
    "Cadastrada",
    "Cancelada",
    "Concluída",
    "Em execução",
    "Inacabada",
    "Inativada",
    "Paralisada",
}
allowed_uf = {
    "DF",
}
allowed_investimento_faixa = {
    "<1",
    "1-10", 
    "10-100",
    "100-1k",
    "1k-10k",
    "10k-100k",
    "100k-1M",
    "1M-10M", 
    "10M-100M",
    "100M-1B",
    ">1B"
}

df_to_save = df.copy()

normalize_date_cols(df_to_save, date_cols)
normalize_cep(df_to_save, "cep")

restrict_to_set(df_to_save, "especie", allowed_especie)
restrict_to_set(df_to_save, "natureza", allowed_natureza)
restrict_to_set(df_to_save, "situacao", allowed_situacao)

normalize_bool_cols(df_to_save, ["isModeladaPorBim"])
normalize_bool_cols(df_to_save, ["isInvestimentoSimbolico"])

normalize_money_cents(df_to_save, ["investimentoTotal"])
normalize_int_cols(
    df_to_save,
    ["duracaoPrevistaDias", "qdtEmpregosGerados", "populacaoBeneficiada"],
)
normalize_float_cols(
    df_to_save,
    ["lat_viacep", "lon_viacep", "lat_ipedf", "lon_ipedf"],
)

normalize_money_cents(fontes_de_recurso_df, ["valorInvestimentoPrevisto"])

with sqlite3.connect(DB_PATH) as con:
    con.execute("PRAGMA foreign_keys = ON")

    df_to_save.to_sql("projetos", con, if_exists="replace", index=False)

    instituicoes_df.to_sql("instituicoes", con, if_exists="replace", index=False)
    eixos_df.to_sql("eixos", con, if_exists="replace", index=False)
    tipos_df.to_sql("tipos", con, if_exists="replace", index=False)
    subtipos_df.to_sql("subtipos", con, if_exists="replace", index=False)
    fontes_de_recurso_df.to_sql(
        "fontes_de_recurso", con, if_exists="replace", index=False
    )

    projeto_tomadores_df.to_sql(
        "projeto_tomadores", con, if_exists="replace", index=False
    )
    projeto_executores_df.to_sql(
        "projeto_executores", con, if_exists="replace", index=False
    )
    projeto_repassadores_df.to_sql(
        "projeto_repassadores", con, if_exists="replace", index=False
    )
    projeto_eixos_df.to_sql("projeto_eixos", con, if_exists="replace", index=False)
    projeto_tipos_df.to_sql("projeto_tipos", con, if_exists="replace", index=False)
    projeto_subtipos_df.to_sql(
        "projeto_subtipos", con, if_exists="replace", index=False
    )

def load_main_table_from_sql():
    with sqlite3.connect(DB_PATH) as con:
        df = pd.read_sql_query("SELECT * FROM projetos", con)

        # dates
        df["dataInicialPrevista"] = pd.to_datetime(df["dataInicialPrevista"])
        df["dataFinalPrevista"] = pd.to_datetime(df["dataFinalPrevista"])
        df["dataInicialEfetiva"] = pd.to_datetime(df["dataInicialEfetiva"])
        df["dataFinalEfetiva"] = pd.to_datetime(df["dataFinalEfetiva"])
        df["dataCadastro"] = pd.to_datetime(df["dataCadastro"])
        df["dataSituacao"] = pd.to_datetime(df["dataSituacao"])

        # bools
        df["isModeladaPorBim"] = df["isModeladaPorBim"].astype("boolean")
        df["isInvestimentoSimbolico"] = df["isInvestimentoSimbolico"].astype("boolean")

        # cats
        df['especie'] = df['especie'].astype(pd.CategoricalDtype(allowed_especie))
        df['natureza'] = df['natureza'].astype(pd.CategoricalDtype(allowed_natureza))
        df['situacao'] = df['situacao'].astype(pd.CategoricalDtype(allowed_situacao))
        df['uf'] = df['uf'].astype(pd.CategoricalDtype(allowed_uf))

        df['investimentoFaixa'] = df['investimentoFaixa'].astype(pd.CategoricalDtype(allowed_investimento_faixa, ordered=True))

        # money --- important!
        df["investimentoTotal"] = df["investimentoTotal"].astype(float) / 100

        # floats
        df["lat_viacep"] = df["lat_viacep"].astype(float)
        df["lon_viacep"] = df["lon_viacep"].astype(float)
        df["lat_ipedf"] = df["lat_ipedf"].astype(float)
        df["lon_ipedf"] = df["lon_ipedf"].astype(float)
        
    return df

df_from_sql = load_main_table_from_sql()

from pandas.testing import assert_frame_equal

try:
    assert_frame_equal(df_from_sql, 
                   df, check_like=True, 
                   check_categorical=False,
                   )
    
    print("Os dados recuperados via SQL são idênticos aos originais!")
except Exception as e:
    print(e)

Os dados recuperados via SQL são idênticos aos originais!

df = df_from_sql.copy()

df.head()

def plot_bim_pie(df: pd.DataFrame, palette: list) -> None:
    # Convert boolean to labels with proper ordering
    df["bim_label"] = (
        df["isModeladaPorBim"].map({True: "Sim", False: "Não"}).fillna("Sem dados")
    )
    bim_counts = df["bim_label"].value_counts()

    fig, ax = plt.subplots(figsize=(4, 4))

    # Create combined labels with percentages
    combined_labels = [
        f"{label}\n({pct:.1f}%)"
        for label, count, pct in zip(
            bim_counts.index,
            bim_counts.values,
            bim_counts.values / bim_counts.sum() * 100,
        )
    ]

    wedges, texts, autotexts = ax.pie(
        bim_counts.values,
        labels=combined_labels,  # Use combined labels
        colors=palette,
        autopct="",  # Remove separate percentage text
        labeldistance=1.25,
        wedgeprops=dict(width=0.5, linewidth=0),
    )

    for text in texts:
        text.set_ha("center")
        text.set_ha("center")

    ax.set_title("Projeto modelado usando BIM\n(Building Information Modeling)")
    plt.tight_layout()
    plt.show()


plot_bim_pie(df, palette)

def make_barh_plot(
    df: pd.DataFrame,
    column: str,
    title: str,
    color: str,
    extra_space: float = 1,
    show_percentages: bool = True,
    labels_inside: bool = False,
    ax=None,
):
    """Modified to accept ax parameter for subplotting"""
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6))

    # Get counts and percentages
    value_counts = df[column].value_counts(dropna=False)
    percentages = (value_counts / len(df)) * 100

    # Create DataFrame with both counts and percentages
    value_counts_df = pd.DataFrame(
        {"count": value_counts, column: value_counts.index, "percentage": percentages}
    ).reset_index(drop=True)

    sns.barplot(
        data=value_counts_df,
        x="count",
        y=column,
        hue=column,
        orient="h",
        order=value_counts_df[column],
        palette=[color for _ in range(len(value_counts_df))],
        legend=False,
        ax=ax,
    )

    # Add labels with count and optionally percentage
    for i, v in enumerate(value_counts_df["count"]):
        if show_percentages:
            label = f"{int(v)} ({percentages.iloc[i]:.1f}%)"
        else:
            label = f"{int(v)}"

        # Position labels inside or outside bars based on labels_inside parameter
        if labels_inside:
            x_pos = v - 5  # Shift inside
            ha = "right"
        else:
            x_pos = v + 5  # Shift outside
            ha = "left"

        ax.text(x_pos, i, label, va="center", ha=ha)

    # Extend x-axis to accommodate labels if they're outside
    max_val = value_counts_df["count"].max()
    if not labels_inside:
        ax.set_xlim(0, max_val * extra_space)
    else:
        ax.set_xlim(0, max_val)

    ax.set_title(title)
    ax.set_xlabel("Quantidade")
    ax.set_ylabel("")
    plt.tight_layout()

    if ax is None:
        plt.show()

# Plot 1: Natureza
_, ax = plt.subplots(figsize=(10, 3))
make_barh_plot(
    df, "natureza", "Natureza dos projetos", palette[0], ax=ax, extra_space=1.2
)
# Plot 2: Situação
_, ax = plt.subplots(figsize=(10, 3))
make_barh_plot(
    df, "situacao", "Situação dos projetos", palette[1], ax=ax, extra_space=1.14
)

# Plot 3: Espécie
_, ax = plt.subplots(figsize=(10, 3))

especie_vis = df.copy()
especie_vis["especie"] = (
    especie_vis["especie"].cat.add_categories("Sem dados").fillna("Sem dados")
)

make_barh_plot(
    especie_vis, "especie", "Espécies de projetos", palette[2], ax=ax, extra_space=1.15
)

# Plot normal scale
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
sns.histplot(
    data=df[~df["isInvestimentoSimbolico"]],
    x="investimentoTotal",
    bins=25,
    color=palette[0],
)
plt.xlabel("Investimento (R$)")
plt.ylabel("Projetos")
plt.title("Projetos por Investimento")

# Plot log scale
plt.subplot(1, 2, 2)
sns.histplot(
    data=df[~df["isInvestimentoSimbolico"]],
    x="investimentoTotal",
    bins=25,
    color=palette[0],
    log_scale=True,
)
plt.xlabel("Investimento (R$)")
plt.ylabel("Projetos")
plt.title("Projetos por Investimento (escala logarítmica)")

plt.tight_layout()
plt.show()

df[~df["isInvestimentoSimbolico"]]["investimentoTotal"].describe().apply(
    lambda x: f"{x:,.2f}"
)

count            571.00
mean      13,871,387.02
std       51,931,177.66
min            4,684.45
25%          667,980.23
50%        2,085,189.23
75%        8,402,637.63
max      839,664,954.32
Name: investimentoTotal, dtype: object

def calculate_iqr(data: pd.Series) -> float:
    q75 = data.quantile(0.75)
    q25 = data.quantile(0.25)
    return q75 - q25


valores = df[~df["isInvestimentoSimbolico"]]["investimentoTotal"]
iqr = calculate_iqr(valores)
print(f"IQR: R$ {iqr:,.2f}")

IQR: R$ 7,734,657.40

# Calcular porcentagem de projetos com investimento simbólico vs total
total_projetos = len(df)
projetos_significativos = len(df[~df["isInvestimentoSimbolico"]])
projetos_simbolicos = len(df[df["isInvestimentoSimbolico"]])

print(
    f"Projetos com investimento simbólico: {projetos_simbolicos} ({projetos_simbolicos/total_projetos*100:.1f}%)"
)

Projetos com investimento simbólico: 119 (17.2%)

date_cols = [
    "dataInicialPrevista",
    "dataFinalPrevista",
    "dataInicialEfetiva",
    "dataFinalEfetiva",
    "dataCadastro",
    "dataSituacao",
]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# Plot normal scale
sns.histplot(
    data=df["duracaoPrevistaDias"].divide(365), bins=50, color=palette[0], ax=ax1
)
ax1.set_xlabel("Duração Prevista (anos)")
ax1.set_ylabel("Projetos")
ax1.set_title("Projetos por Duração Prevista")

# Plot log scale
sns.histplot(
    data=df["duracaoPrevistaDias"].divide(365),
    bins=50,
    color=palette[0],
    log_scale=True,
    ax=ax2,
)
ax2.set_xlabel("Duração Prevista (anos)")
ax2.set_ylabel("Projetos")
ax2.set_title("Projetos por Duração Prevista\n(escala logarítmica)")

plt.tight_layout()
plt.show()

df["duracaoPrevistaDias"].divide(365).describe().apply(lambda x: f"{x:,.2f}")

count    688.00
mean       2.06
std        1.89
min        0.00
25%        0.80
50%        1.28
75%        2.99
max       13.22
Name: duracaoPrevistaDias, dtype: object

print("IQR:", calculate_iqr(df["duracaoPrevistaDias"].divide(365)))

IQR: 2.187671232876712

# valores baixos
df[["descricao", "duracaoPrevistaDias", "investimentoFaixa"]].sort_values(
    by="duracaoPrevistaDias", ascending=True
).head()

def plot_date_distributions(df, palette):
    from matplotlib import dates as mdates

    # Create subplots for date distributions
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), sharey=True)

    # Plot initial dates
    sns.histplot(data=df, x="dataInicialPrevista", bins=25, color=palette[0], ax=ax1)
    ax1.set_title("Datas Iniciais Previstas")
    ax1.set_ylabel("Frequência")
    ax1.set_xlabel("")
    ax1.tick_params(axis="x", rotation=45)

    # Plot final dates
    sns.histplot(data=df, x="dataFinalPrevista", bins=25, color=palette[0], ax=ax2)
    ax2.set_title("Datas Finais Previstas")
    ax2.set_ylabel("")
    ax2.set_xlabel("")
    ax2.tick_params(axis="x", rotation=45)

    # Set more frequent x-axis ticks
    ax1.xaxis.set_major_locator(mdates.YearLocator(2))  # Show every 2 years
    ax2.xaxis.set_major_locator(mdates.YearLocator(2))  # Show every 2 years

    # Format dates on x-axis
    ax1.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
    ax2.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))

    # Adjust layout
    plt.tight_layout()
    plt.show()


plot_date_distributions(df, palette)

# Preparar dados
data_for_reg = df[~df["isInvestimentoSimbolico"]].copy()
data_for_reg = data_for_reg[
    ["idUnico", "descricao", "duracaoPrevistaDias", "investimentoTotal"]
].dropna()


data_for_reg["investimentoTotal_log"] = np.log(data_for_reg["investimentoTotal"])
data_for_reg["duracaoPrevistaDias_log"] = np.log(data_for_reg["duracaoPrevistaDias"])

fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# 1. Escala original
sns.regplot(
    data=data_for_reg,
    x="duracaoPrevistaDias",
    y="investimentoTotal",
    scatter_kws={"alpha": 0.5},
    line_kws={"color": "red"},
    ax=axes[0],
)
axes[0].set_title("Relação entre Duração e Investimento")
axes[0].set_xlabel("Duração Prevista (dias)")
axes[0].set_ylabel("Investimento Total (R$)")

# 2. Log-log
sns.regplot(
    data=data_for_reg,
    x="duracaoPrevistaDias_log",
    y="investimentoTotal_log",
    scatter_kws={"alpha": 0.5},
    line_kws={"color": "red"},
    ax=axes[1],
)
axes[1].set_title("Relação entre Duração e Investimento (log-log)")
axes[1].set_xlabel("Duração Prevista (log dias)")
axes[1].set_ylabel("Investimento Total (log R$)")

plt.tight_layout()
plt.show()

/home/noah-art3mis/projects/takehome-lablivre-analysis/.venv/lib/python3.10/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
/home/noah-art3mis/projects/takehome-lablivre-analysis/.venv/lib/python3.10/site-packages/numpy/_core/function_base.py:162: RuntimeWarning: invalid value encountered in multiply
  y *= step
/home/noah-art3mis/projects/takehome-lablivre-analysis/.venv/lib/python3.10/site-packages/numpy/_core/function_base.py:172: RuntimeWarning: invalid value encountered in add
  y += start
/home/noah-art3mis/projects/takehome-lablivre-analysis/.venv/lib/python3.10/site-packages/numpy/lib/_nanfunctions_impl.py:1620: RuntimeWarning: All-NaN slice encountered
  return fnb._ureduce(a,

pg.corr(
    df[~df["isInvestimentoSimbolico"]]["duracaoPrevistaDias"],
    df[~df["isInvestimentoSimbolico"]]["investimentoTotal"],
    method="spearman",  # nao parametrico
)

# Replace inf values with NaN
data_for_reg["duracaoPrevistaDias_log"] = data_for_reg[
    "duracaoPrevistaDias_log"
].replace([np.inf, -np.inf], np.nan)
data_for_reg["investimentoTotal_log"] = data_for_reg["investimentoTotal_log"].replace(
    [np.inf, -np.inf], np.nan
)

# Drop rows with NaN
data_clean = data_for_reg.dropna()

X = sm.add_constant(data_clean["duracaoPrevistaDias_log"])
y = data_clean["investimentoTotal_log"]
model = sm.OLS(y, X).fit()

data_clean["residuos"] = model.resid
data_clean["residuos_abs"] = np.abs(model.resid)

data_clean.nlargest(10, "residuos_abs").assign(
    duracaoPrevistaDias_years=lambda x: x["duracaoPrevistaDias"],
    investimentoTotal_k=lambda x: x["investimentoTotal"].apply(lambda x: f"{x:,.2f}"),
)[
    ["idUnico", "descricao", "duracaoPrevistaDias", "investimentoTotal_k", "residuos"]
].round(
    {"duracaoPrevistaDias_years": 1, "investimentoTotal_k": 0, "residuos": 1}
)

/tmp/ipykernel_5144/1433076602.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean["residuos"] = model.resid
/tmp/ipykernel_5144/1433076602.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean["residuos_abs"] = np.abs(model.resid)

def create_investment_map(df):
    # Create map centered on DF
    m = folium.Map(location=[-15.7942, -47.8822], zoom_start=10)

    # Define single color for all markers
    COLOR = "#412355"  # Purple (LabLivre palette)

    # Prepare data with proper coordinate selection
    df_map = df[
        [
            "cep",
            "nome",
            "descricao",
            "lat_ipedf",
            "lon_ipedf",
            "lat_viacep",
            "lon_viacep",
            "investimentoTotal",
        ]
    ].copy()

    # Add markers for each project with coordinates
    for idx, row in df_map.iterrows():
        lat_ipedf = row["lat_ipedf"]
        lon_ipedf = row["lon_ipedf"]
        lat_viacep = row["lat_viacep"]
        lon_viacep = row["lon_viacep"]

        # Determine which coordinates to use (prioritize IPEDF)
        if pd.notna(lat_ipedf) and pd.notna(lon_ipedf):
            lat, lon = lat_ipedf, lon_ipedf
            source = "IPEDF"
        elif pd.notna(lat_viacep) and pd.notna(lon_viacep):
            lat, lon = lat_viacep, lon_viacep
            source = "ViaCEP"
        else:
            # No coordinates available
            continue

        # Prepare popup text
        popup_text = (
            f"""
        <b>CEP:</b> {row['cep']}<br>
        <b>Fonte:</b> {source}<br>
        <b>Nome:</b> {row['nome'][:50]}...<br>
        <b>Investimento:</b> R$ {row['investimentoTotal']:,.2f}
        """
            if pd.notna(row["investimentoTotal"])
            else f"""
        <b>CEP:</b> {row['cep']}<br>
        <b>Fonte:</b> {source}<br>
        <b>Nome:</b> {row['nome'][:50]}...
        """
        )

        # Add marker with radius scaled by investment amount
        folium.CircleMarker(
            location=[lat, lon],
            radius=(
                np.sqrt(row["investimentoTotal"] / 100000)
                if pd.notna(row["investimentoTotal"])
                else 5
            ),
            color=COLOR,
            fill=True,
            fillColor=COLOR,
            fillOpacity=0.7,
            popup=folium.Popup(popup_text, max_width=300),
            tooltip=f"{row['cep']} - {source}",
        ).add_to(m)

    return m


m = create_investment_map(df)
m

# df[['idUnico', 'textoTotal']].to_parquet('data/projetos_sem_vetores.parquet')

# # executar em um computador com GPU

# def get_embeddings(df: pd.DataFrame, model: SentenceTransformer) -> pd.DataFrame:
#     # https://huggingface.co/google/embeddinggemma-300m

#     # requires HF_TOKEN env variable
#     from dotenv import load_dotenv
#     load_dotenv()


#     # gated model: only unlocks if you sign the terms and conditions consent form in the huggingface model page
#     from sentence_transformers import SentenceTransformer
#     model = SentenceTransformer("google/embeddinggemma-300m")

#     sentences = df["textoTotal"].tolist()

#     embeddings = model.encode(
#         sentences=sentences,
#         prompt_name="Clustering", # gemma specific
#         show_progress_bar=True,
#         convert_to_numpy=True,
#     )

#     sentences_with_embeddings = pd.DataFrame(
#         {"texto": sentences, "embeddings": embeddings.tolist()}
#     )
#     df_with_embeddings = pd.merge(
#         df, sentences_with_embeddings, left_on="textoTotal", right_on="texto"
#     )
#     df_with_embeddings = df_with_embeddings.drop("texto", axis=1)
#     return df_with_embeddings


# textos_df = pd.read_parquet('data/projetos_sem_vetores.parquet')
# df_with_embeddings = get_embeddings(textos_df, model)
# df_with_embeddings.to_parquet('data/projetos_com_vetores.parquet')

embeddings = pd.read_parquet("data/projetos_com_vetores.parquet")
df = df.merge(embeddings.drop("textoTotal", axis=1), on="idUnico", how="left")

def umap_df(
    df: pd.DataFrame, vectors_col: str, dimension_names: list[str]
) -> pd.DataFrame:
    # Based on https://umap-learn.readthedocs.io/en/latest/

    reducer = umap.UMAP(
        n_components=2,
        n_neighbors=15,  # Default from documentation
        min_dist=0.1,  # Default from documentation
        metric="euclidean",  # Default metric
        random_state=42,
        verbose=False,
    )

    umap_components = reducer.fit_transform(df[vectors_col].tolist())
    umap_df = pd.DataFrame(umap_components, columns=dimension_names)  # type: ignore
    return pd.concat([df, umap_df], axis=1)


def hdbscan_df(
    df: pd.DataFrame, dimension_names: list[str], min_cluster_size: int
) -> pd.DataFrame:
    # https://github.com/scikit-learn-contrib/hdbscan

    clusterer = hdbscan.HDBSCAN(min_cluster_size)
    clusters = clusterer.fit_predict(df[dimension_names])
    df["cluster"] = clusters
    df["cluster"] = df["cluster"].astype(str)

    return df

dimension_names = ["UMAP_1", "UMAP_2"]
df = umap_df(df, "embeddings", dimension_names)
df = hdbscan_df(df, dimension_names, min_cluster_size=20)

/home/noah-art3mis/projects/takehome-lablivre-analysis/.venv/lib/python3.10/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(

/home/noah-art3mis/projects/takehome-lablivre-analysis/.venv/lib/python3.10/site-packages/sklearn/utils/deprecation.py:132: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.
  warnings.warn(
/home/noah-art3mis/projects/takehome-lablivre-analysis/.venv/lib/python3.10/site-packages/sklearn/utils/deprecation.py:132: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.
  warnings.warn(

def plot_embeddings_interactive(
    df: pd.DataFrame, dimension_names: list[str], hue: str
) -> alt.Chart:
    chart = (
        alt.Chart(df)
        .mark_circle(size=100, opacity=0.7, strokeWidth=0.5, stroke="white")
        .encode(
            x=alt.X(dimension_names[0], title="UMAP1"),
            y=alt.Y(dimension_names[1], title="UMAP2"),
            color=alt.Color(
                hue,
                scale=alt.Scale(scheme="tableau10"),
                legend=alt.Legend(title="Cluster"),
            ),
            tooltip=[
                alt.Tooltip(hue, title="Grupo"),
                alt.Tooltip("idUnico", title="ID"),
                alt.Tooltip("descricao", title="Descrição"),
                # alt.Tooltip("investimentoTotal", title="Investimento"),
                alt.Tooltip("duracaoPrevistaDias", title="Duração Prevista"),
                alt.Tooltip("natureza", title="Natureza"),
                # alt.Tooltip("situacao", title="Situação"),
                alt.Tooltip("especie", title="Espécie"),
                alt.Tooltip("investimentoFaixa", title="Investimento Faixa"),
                # alt.Tooltip("dataInicialPrevista", title="Data Início Prevista"),
                # alt.Tooltip("dataFinalPrevista", title="Data Final Prevista"),
            ],
        )
        .properties(
            width=600,
            height=400,
            title="Representação UMAP dos embeddings dos projetos com agrupamento (HDBSCAN)",
        )
        .interactive()
    )  # Enables pan and zoom

    return chart

plot_embeddings_interactive(df, dimension_names, hue="cluster")

cluster_names = {
    0: "Cobertura de quadra escolar",
    1: "Educação básica",
    2: "Saúde",
    3: "Energia",
    4: "Educação superior",
    5: "Contratação",
    6: "Engenharia civil",
    7: "Transporte",
    8: "Serviço social",
    -1: "Outros",
}

df["clusterNome"] = df["cluster"].astype(str).map(lambda x: cluster_names[int(x)])
df[["idUnico", "cluster", "clusterNome"]]

_, ax = plt.subplots(figsize=(10, 3))
make_barh_plot(
    df,
    "clusterNome",
    "Grupos de projetos",
    palette[1],
    extra_space=1.17,
    show_percentages=True,
    ax=ax,
)

def set_categorical_order(df_to_order, df_reference, column):
    order = df_reference[column].value_counts().index
    df_to_order[column] = pd.Categorical(
        df_to_order[column], categories=order, ordered=True
    )
    return df_to_order

# Agregar investimento total por grupo (clusterNome)
cluster_investimentos_df = (
    df.groupby("clusterNome", dropna=False)["investimentoTotal"]
    .sum()
    .reset_index()
    .rename(columns={"investimentoTotal": "investimentoPorArea"})
)

# Converter para milhões e calcular percentuais
cluster_investimentos_df["investimentoPorArea"] = (
    cluster_investimentos_df["investimentoPorArea"] / 1_000_000
)
total_investimento = cluster_investimentos_df["investimentoPorArea"].sum()
cluster_investimentos_df["percentual"] = (
    cluster_investimentos_df["investimentoPorArea"] / total_investimento
) * 100

# Usar a mesma ordem do gráfico anterior
order = df["clusterNome"].value_counts().index
cluster_investimentos_df["clusterNome"] = pd.Categorical(
    cluster_investimentos_df["clusterNome"], categories=order, ordered=True
)
cluster_investimentos_df = cluster_investimentos_df.sort_values("clusterNome")

# Criar figura com um subplot
plt.figure(figsize=(10, 3))

# Plotar valores absolutos
ax = sns.barplot(
    data=cluster_investimentos_df,
    x="investimentoPorArea",
    y="clusterNome",
    color=palette[0],
    orient="h",
)

# Rótulos com valores absolutos e percentuais
for i, (v, p) in enumerate(
    zip(
        cluster_investimentos_df["investimentoPorArea"],
        cluster_investimentos_df["percentual"],
    )
):
    ax.text(v * 1.01, i, f"R$ {v:,.0f}M ({p:.1f}%)", va="center", ha="left")

ax.set_title("Investimento total por grupo de projetos")
ax.set_xlabel("Investimento total (milhões R$)")
ax.set_ylabel("")

max_val = cluster_investimentos_df["investimentoPorArea"].max()
ax.set_xlim(0, max_val * 1.24)

plt.tight_layout()
plt.show()

_, ax = plt.subplots(figsize=(10, 3))
make_barh_plot(
    df,
    "clusterNome",
    "Grupos de projetos",
    palette[1],
    extra_space=1.17,
    show_percentages=True,
    ax=ax,
)

f"Total de investimentos: R$ {df['investimentoTotal'].sum():,.2f}"

'Total de investimentos: R$ 8,402,336,549.57'

# Calcular mediana de caracteres da descrição por cluster
desc_len = (
    df.groupby("clusterNome")
    .agg(
        n_projetos=("idUnico", "nunique"),
        mediana_caracteres=("descricao", lambda x: x.str.len().median()),
    )
    .round(1)
)

# Reordenar usando a função set_categorical_order
desc_len = set_categorical_order(desc_len.reset_index(), df, "clusterNome")

desc_len = desc_len.sort_values("mediana_caracteres", ascending=True)

plt.figure(figsize=(10, 4))
ax = sns.barplot(
    data=desc_len, x="mediana_caracteres", y="clusterNome", color=palette[0], orient="h"
)

# Adicionar rótulos com valores
for bar in ax.patches:
    width = bar.get_width()
    y = bar.get_y() + bar.get_height() / 2
    ax.text(width * 1.01, y, f"{width:,.0f}", va="center", ha="left")

ax.set_title("Mediana de caracteres na descrição por grupo de projetos")
ax.set_xlabel("Número mediano de caracteres")
ax.set_ylabel("")

plt.tight_layout()
plt.show()

# Create butterfly plot comparing number of projects vs investment by cluster
def create_butterfly_plot(df, palette):
    """
    Create a butterfly plot comparing number of projects vs investment by cluster
    """
    # Prepare data for butterfly plot
    butterfly_data = (
        df.groupby("clusterNome")
        .agg(
            {
                "idUnico": "nunique",  # Count unique projects
                "investimentoTotal": "sum",  # Sum investment
            }
        )
        .reset_index()
    )

    # Calculate percentages
    total_projects = butterfly_data["idUnico"].sum()
    total_investment = butterfly_data["investimentoTotal"].sum()

    butterfly_data["projetos_pct"] = (butterfly_data["idUnico"] / total_projects) * 100
    butterfly_data["investimento_pct"] = (
        butterfly_data["investimentoTotal"] / total_investment
    ) * 100

    # Sort by number of projects (descending)
    butterfly_data = butterfly_data.sort_values("idUnico", ascending=True)

    # Create the plot
    fig, ax = plt.subplots(figsize=(10, 6))

    # Create horizontal positions
    y_pos = np.arange(len(butterfly_data))

    # Plot left side (percentages of projects) - negative values
    bars_left = ax.barh(
        y_pos,
        -butterfly_data["projetos_pct"],
        color=palette[0],
        alpha=0.8,
        label="% dos projetos",
    )

    # Plot right side (percentages of investment) - positive values
    bars_right = ax.barh(
        y_pos,
        butterfly_data["investimento_pct"],
        color=palette[1],
        alpha=0.8,
        label="% do investimento",
    )

    # Customize the plot
    ax.set_yticks(y_pos)
    ax.set_yticklabels(butterfly_data["clusterNome"])
    ax.set_xlabel("# Projetos | Investimento")
    ax.set_title(
        "Comparação: Distribuição de Projetos vs Investimento por Grupo",
        fontsize=14,
        fontweight="bold",
        pad=20,
    )

    # Add value labels on bars
    for i, (proj, inv, proj_pct, inv_pct) in enumerate(
        zip(
            butterfly_data["idUnico"],
            butterfly_data["investimentoTotal"],
            butterfly_data["projetos_pct"],
            butterfly_data["investimento_pct"],
        )
    ):
        # Left side labels (projects)
        ax.text(
            -proj_pct - 1,
            i,
            f"{int(proj)}\n({proj_pct:.1f}%)",
            ha="right",
            va="center",
            fontsize=9,
        )

        # Right side labels (investment)
        ax.text(
            inv_pct + 1,
            i,
            f"R$ {inv/1e6:,.0f}M\n({inv_pct:.1f}%)",
            ha="left",
            va="center",
            fontsize=9,
        )

    # Add vertical line at zero
    ax.axvline(x=0, color="black", linewidth=0.8, alpha=0.5)

    # Center the plot by using same scale on both sides
    max_pct = max(
        butterfly_data["projetos_pct"].max(), butterfly_data["investimento_pct"].max()
    )
    ax.set_xlim(-max_pct * 1.3, max_pct * 1.3)

    # Add legend
    # ax.legend(loc="upper right", bbox_to_anchor=(1, 1))

    # Remove top and right spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    plt.tight_layout()
    plt.show()


create_butterfly_plot(df, palette)

plot_embeddings_interactive(df, dimension_names, hue="cluster")

plot_bim_pie(df, palette)

plot_investment_ranges(df, palette)

df['investimentoFaixa'].cat.ordered

True

m = create_investment_map(df)
m

create_butterfly_plot(df, palette)

	NaN Values	% NaN
dataFinalEfetiva	707	99.3
dataInicialEfetiva	690	96.9
observacoesPertinentes	606	85.1
qdtEmpregosGerados	574	80.6
populacaoBeneficiada	572	80.3
descPopulacaoBeneficiada	568	79.8
naturezaOutras	522	73.3
descPlanoNacionalPoliticaVinculado	464	65.2
cep	365	51.3
endereco	337	47.3
isModeladaPorBim	216	30.3

Métrica	Valor
Investimento mínimo	R$ 4,7 mil
Investimento máximo	R$ 840 milhões
Mediana	R$ 2,1 milhões
50% dos projetos	Entre R$ 668 mil e 8,4 milhões

	idUnico	descricao	duracaoPrevistaDias	investimentoTotal_k	residuos
287	46882.53-02	CONSTRUÇÃO DE UNIDADE DE ATENÇÃO ESPECIALIZADA...	1.0	2,493,000.00	8.1
374	43724.53-06	Contratação de empresa do ramo de engenharia/a...	1081.0	4,684.45	-7.1
117	557.53-69	Projeto Estratégico do SISFRON, cujo objeto é ...	579.0	839,664,954.32	5.8
69	10613.53-07	Prestação de serviços de disponibilização de a...	1799.0	47,700.00	-5.4
646	5223.53-75	ELABORAÇÃO DE PROJETOS BÁSICO E EXECUTIVO DE P...	789.0	20,305.41	-5.2
516	39020.52-43	execução das obras de adequação de capacidade,...	629.0	352,994,609.29	4.9
549	28451.53-40	ADEQUACAO DE TRECHO RODOVIARIO - ENTRONCAMENTO...	755.0	359,130,057.13	4.6
298	1729.53-04	Prestação de serviços de elaboração de Antepro...	359.0	19,560.00	-4.2
420	57958.53-86	Celebrar parcerias referentes ao PRONER	364.0	20,000.00	-4.2
389	42995.53-70	Contratação de empresa especializada em arquit...	352.0	21,560.99	-4.1

ID	Descrição	Duração	Investimento
46882.53-02	CONSTRUÇÃO DE UNIDADE DE ATENÇÃO ESPECIALIZADA...	1 dia	R$ 2.493.000,00
43724.53-06	Contratação de empresa do ramo de engenharia/a...	3 anos	R$ 4.684,45
557.53-69	Projeto Estratégico do SISFRON...	1.6 anos	R$ 839.664.954,32
10613.53-07	Prestação de serviços de disponibilização de a...	4.9 anos	R$ 47.700,00
5223.53-75	ELABORAÇÃO DE PROJETOS BÁSICO E EXECUTIVO DE P...	2.2 anos	R$ 20.305,41
39020.52-43	execução das obras de adequação de capacidade,...	1.7 anos	R$ 352.994.609,29
28451.53-40	ADEQUACAO DE TRECHO RODOVIARIO - ENTRONCAMENTO...	2.1 anos	R$ 359.130.057,13

Grupo	# Projetos	% Projetos	Investimento	Percentual
Engenharia civil	224	31.6%	R$ 2,116M	25.2%
Educação superior	92	13.0%	R$ 339M	4.0%
Educação básica	85	12.0%	R$ 38M	0.5%
Contratação	81	11.4%	R$ 4,059M	48.3%
Transporte	67	9.5%	R$ 492M	5.9%
Outros	60	8.5%	R$ 1,253M	14.9%
Obra de quadra escolar	26	3.7%	R$ 4M	0.0%
Serviço social	26	3.7%	R$ 26M	0.3%
Saúde	24	3.4%	R$ 27M	0.3%
Energia	23	3.2%	R$ 48M	0.6%

Análise de projetos de infraestrutura com investimento federal no Distrito Federal¶

1. Extração de dados¶

2. Tratamento de dados¶

2.1 Duplicatas¶

2.2 Valores ausentes¶

2.3 Limpeza específica de dados¶

Encoding¶

Registros incorretos¶

Endereços e CEP¶

2.4 Tipagem¶

2.5 Colunas aninhadas¶

2.5 Engenharia de características¶

textoTotal¶

investimentoTotal¶

investimentoFaixa¶

isInvestimentoSimbolico¶

duracaoPrevista¶

Latitude e longitude¶

Carregamento de dados¶

3. Análise de dados¶

Informações básicas¶

Qualidade¶

Categorias¶

Investimento¶

Datas¶

Dados geográficos (GIS)¶

Processamento de linguagem natural (NLP)¶

4. Resumo executivo¶

Panorama¶

Perfil dos Projetos¶

Qualidade¶

Distribuição Geográfica¶

Concentração de Investimentos¶

Limitações¶

	idUnico	nome	cep	endereco	descricao	funcaoSocial	metaGlobal	dataInicialPrevista	dataFinalPrevista	dataInicialEfetiva	dataFinalEfetiva	dataCadastro	especie	natureza	naturezaOutras	situacao	descPlanoNacionalPoliticaVinculado	uf	qdtEmpregosGerados	descPopulacaoBeneficiada	populacaoBeneficiada	observacoesPertinentes	isModeladaPorBim	dataSituacao	tomadores	executores	repassadores	eixos	tipos	subTipos	fontesDeRecurso
0	1828.53-16	SISEG - Sistema integrado de segurança	1		implantação dos sistemas de monitoramento por ...	implantação dos sistemas de monitoramento por ...	Modernização do sistema de segurança do Banco ...	2019-02-04	2024-02-06	None	None	2021-05-12	Recuperação	Projeto		Cadastrada	None	DF	None	None	None	None	None	2021-05-12	[]	[{'nome': 'BANCO CENTRAL DO BRASIL', 'codigo':...	[]	[{'id': 1, 'descricao': 'Administrativo'}]	[{'id': 5, 'descricao': 'Administrativo', 'idE...	[{'id': 59, 'descricao': 'Obras em Imóveis de ...	[{'origem': 'Federal', 'valorInvestimentoPrevi...
1	1711.53-18	Reforma do espelho d'água do Edifício-Sede do ...	1		Reforma do espelho d'água do Edifício-Sede do ...	Reestabelecer a impermeabilização do espelho d...	Recuperação do sistema de impermeabilização e ...	2021-01-04	2021-05-18	None	None	2021-05-04	Reforma	Obra		Cadastrada	None	DF	None	None	None	None	None	2021-05-04	[{'nome': 'BANCO CENTRAL DO BRASIL', 'codigo':...	[{'nome': 'BANCO CENTRAL DO BRASIL', 'codigo':...	[]	[{'id': 1, 'descricao': 'Administrativo'}]	[{'id': 5, 'descricao': 'Administrativo', 'idE...	[{'id': 59, 'descricao': 'Obras em Imóveis de ...	[{'origem': 'Federal', 'valorInvestimentoPrevi...
2	1989.53-01	Elaboração de projetos arquitetônicos e de eng...	1	Campus Universitário Darcy Ribeiro, Brasília - DF	Contratação de empresa especializada para elab...	Obras para unidades acadêmicas de ensino e pes...	Elaboração de projetos de engenharia e Arquite...	2021-05-18	2021-09-29	None	None	2021-05-17	Construção	Projeto	None	Cadastrada	None	DF	None	None	None	None	None	2021-05-17	[]	[{'nome': 'FUNDACAO UNIVERSIDADE DE BRASILIA',...	[{'nome': 'FUNDACAO UNIVERSIDADE DE BRASILIA',...	[{'id': 1, 'descricao': 'Administrativo'}]	[{'id': 8, 'descricao': 'Educação', 'idEixo': 1}]	[{'id': 46, 'descricao': 'Instituições Federai...	[{'origem': 'Federal', 'valorInvestimentoPrevi...
3	2004.53-58	SISEG - Sistema integrado de segurança	1		implantação dos sistemas de monitoramento por ...	implantação dos sistemas de monitoramento por ...	Modernização do sistema de segurança do Banco ...	2019-02-04	2024-02-06	None	None	2021-05-18	Recuperação	Projeto		Cadastrada	None	DF	None	None	None	None	None	2021-05-18	[{'nome': 'BANCO CENTRAL DO BRASIL-ORC.FISCAL/...	[{'nome': 'BANCO CENTRAL DO BRASIL', 'codigo':...	[]	[{'id': 1, 'descricao': 'Administrativo'}]	[{'id': 5, 'descricao': 'Administrativo', 'idE...	[{'id': 59, 'descricao': 'Obras em Imóveis de ...	[{'origem': 'Federal', 'valorInvestimentoPrevi...
4	2617.53-02	CENTRO DE FORMAÇÃO TECNOLOGICA - CFT - CAMPUS ...	1	Área Especial n° 01, Quadra 16, Cidade do Auto...	construção um galpão com pé direito alto, que ...	atender a comunidade acadêmica do Campus Estru...	aumentar a oferta de cursos técnicos/tecnológi...	2020-12-01	2021-10-27	None	None	2021-06-18	Construção	Obra		Cadastrada	triplicar as matrículas da educação profission...	DF	None	None	None	None	None	2021-06-18	[{'nome': 'INSTITUTO FED. ED. CIENCIA E TEC. D...	[{'nome': 'INSTITUTO FED. ED. CIENCIA E TEC. D...	[{'nome': 'MINISTÉRIO DA EDUCAÇÃO', 'codigo': ...	[{'id': 4, 'descricao': 'Social'}]	[{'id': 46, 'descricao': 'Educação', 'idEixo':...	[{'id': 84, 'descricao': 'Educação', 'idTipo':...	[{'origem': 'Federal', 'valorInvestimentoPrevi...

	idUnico	nome	cep	endereco	descricao	funcaoSocial	metaGlobal	dataInicialPrevista	dataFinalPrevista	dataInicialEfetiva	dataFinalEfetiva	dataCadastro	especie	natureza	naturezaOutras	situacao	descPlanoNacionalPoliticaVinculado	uf	qdtEmpregosGerados	descPopulacaoBeneficiada	populacaoBeneficiada	observacoesPertinentes	isModeladaPorBim	dataSituacao	tomadores	executores	repassadores	eixos	tipos	subTipos	fontesDeRecurso
79	4543.53-79	202111-22-Ronald 1	70067-901	2021122-Ronald - Endereço Completo	2021122-Ronald - Descrição do Projeto	2021122-Ronald - Descrição Funç]ap Social	20211122-Ronald - Descrição Meta Global	2021-12-10	2021-12-10	None	None	2021-11-17	Fabricação	Projeto	outros	Cadastrada	None	DF	None	None	None	None	None	2021-11-17	[{'nome': 'FUND.UNIVERSIDADE FEDERAL VALE SAO ...	[{'nome': 'MINISTÉRIO DA INTEGRAÇÃO E DO DESEN...	[{'nome': 'MINISTÉRIO DA INTEGRAÇÃO E DO DESEN...	[{'id': 4, 'descricao': 'Social'}]	[{'id': 45, 'descricao': 'Desenvolvimento', 'i...	[{'id': 38, 'descricao': 'Saneamento', 'idTipo...	[{'origem': 'Federal', 'valorInvestimentoPrevi...
80	4540.53-87	202111-22-Ronald 1	70067-901	2021122-Ronald - Endereço Completo	2021122-Ronald - Descrição do Projeto	2021122-Ronald - Descrição Funç]ap Social	20211122-Ronald - Descrição Meta Global	2021-12-10	2021-12-10	None	None	2021-11-17	Reforma	Obra	outros	Cadastrada	None	DF	None	None	None	None	None	2021-11-17	[{'nome': 'FUND.UNIVERSIDADE FEDERAL VALE SAO ...	[{'nome': 'MINISTÉRIO DA INTEGRAÇÃO E DO DESEN...	[{'nome': 'MINISTÉRIO DA INTEGRAÇÃO E DO DESEN...	[{'id': 4, 'descricao': 'Social'}]	[{'id': 45, 'descricao': 'Desenvolvimento', 'i...	[{'id': 38, 'descricao': 'Saneamento', 'idTipo...	[{'origem': 'Federal', 'valorInvestimentoPrevi...
82	4542.53-01	202111-22-Ronald 1	70067-901	2021122-Ronald - Endereço Completo	2021122-Ronald - Descrição do Projeto	2021122-Ronald - Descrição Funç]ap Social	20211122-Ronald - Descrição Meta Global	2021-12-10	2021-12-10	None	None	2021-11-17	Reforma	Projeto de Investimento em Infraestrutura	outros	Cadastrada	None	DF	None	None	None	None	None	2021-11-17	[{'nome': 'FUND.UNIVERSIDADE FEDERAL VALE SAO ...	[{'nome': 'MINISTÉRIO DA INTEGRAÇÃO E DO DESEN...	[{'nome': 'MINISTÉRIO DA INTEGRAÇÃO E DO DESEN...	[{'id': 4, 'descricao': 'Social'}]	[{'id': 45, 'descricao': 'Desenvolvimento', 'i...	[{'id': 38, 'descricao': 'Saneamento', 'idTipo...	[{'origem': 'Federal', 'valorInvestimentoPrevi...
83	4544.53-36	202111-22-Ronald 1	70067-901	2021122-Ronald - Endereço Completo	2021122-Ronald - Descrição do Projeto	2021122-Ronald - Descrição Funç]ap Social	20211122-Ronald - Descrição Meta Global	2021-12-10	2021-12-10	None	None	2021-11-17	Reforma	Projeto de Investimento em Infraestrutura	outros	Cadastrada	None	DF	None	None	None	None	None	2021-11-17	[{'nome': 'FUND.UNIVERSIDADE FEDERAL VALE SAO ...	[{'nome': 'MINISTÉRIO DA INTEGRAÇÃO E DO DESEN...	[{'nome': 'MINISTÉRIO DA INTEGRAÇÃO E DO DESEN...	[{'id': 4, 'descricao': 'Social'}]	[{'id': 45, 'descricao': 'Desenvolvimento', 'i...	[{'id': 38, 'descricao': 'Saneamento', 'idTipo...	[{'origem': 'Federal', 'valorInvestimentoPrevi...
205	4541.53-44	202111-22-Ronald 1	70067-901	2021122-Ronald - Endereço Completo	2021122-Ronald - Descrição do Projeto	2021122-Ronald - Descrição Funç]ap Social	20211122-Ronald - Descrição Meta Global	2021-12-10	2021-12-10	None	None	2021-11-17	Recuperação	Projeto de Investimento em Infraestrutura	outros	Cadastrada	None	DF	None	None	None	None	None	2021-11-17	[{'nome': 'FUND.UNIVERSIDADE FEDERAL VALE SAO ...	[{'nome': 'MINISTÉRIO DA INTEGRAÇÃO E DO DESEN...	[{'nome': 'MINISTÉRIO DA INTEGRAÇÃO E DO DESEN...	[{'id': 4, 'descricao': 'Social'}]	[{'id': 45, 'descricao': 'Desenvolvimento', 'i...	[{'id': 38, 'descricao': 'Saneamento', 'idTipo...	[{'origem': 'Federal', 'valorInvestimentoPrevi...

	descricao	duracaoPrevistaDias	investimentoFaixa
331	CONSTRUÇÃO DE UNIDADE BÁSICA DE SAÚDE	0.0	1M-10M
326	CONSTRUÇÃO DE UNIDADE DE ATENÇÃO ESPECIALIZADA...	0.0	1M-10M
9	Obra no Batalhão Escola e Pronto Emprego para ...	0.0	10M-100M
304	CONSTRUÇÃO DE UNIDADE DE ATENÇÃO ESPECIALIZADA...	0.0	1M-10M
287	CONSTRUÇÃO DE UNIDADE DE ATENÇÃO ESPECIALIZADA...	1.0	1M-10M

Grupo	Categoria
0	Cobertura de quadra escolar
1	Educação básica
2	Saúde
3	Energia
4	Educação superior
5	Contratação
6	Engenharia civil
7	Transporte
8	Serviço social
-1	Outros

	nome
codigo
25280	BANCO CENTRAL DO BRASIL

	descricao
id
1	Administrativo

	descricao	idEixo
id
5	Administrativo	1