Web Scraper em Zig — Tutorial Passo a Passo

Neste tutorial, vamos construir um web scraper em Zig que faz requisições HTTP, extrai informações de páginas HTML e exibe os resultados no terminal. Este projeto explora o cliente HTTP da stdlib de Zig, parsing de texto e entrada/saída pelo terminal; salvar os resultados em arquivo fica como extensão.

O Que Vamos Construir

Nosso scraper vai:

Fazer requisições HTTP/HTTPS para URLs arbitrárias
Extrair títulos, links e texto de páginas HTML
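Seguir links até uma profundidade configurável (ainda não implementado no código deste tutorial; fica como extensão)
Salvar resultados em formato estruturado (nesta versão os resultados são apenas exibidos no terminal)
Respeitar limites de taxa (rate limiting) e timeouts (nesta versão há apenas o limite de 1 MB para o corpo da resposta)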

Pré-requisitos

Zig 0.13 ou 0.14 instalado (guia de instalação) — versões mais recentes alteraram APIs da stdlib usadas aqui, como `std.io.getStdOut`
Conhecimentos básicos de HTTP e HTML
Familiaridade com alocadores de memória em Zig

Passo 1: Estrutura do Projeto

mkdir web-scraper
cd web-scraper
zig init

Passo 2: Cliente HTTP Básico

const std = @import("std");
const http = std.http;
const mem = std.mem;
const Uri = std.Uri;
const Allocator = std.mem.Allocator;

/// Outcome of a single HTTP request.
///
/// `corpo` and, when present, `content_type` are heap buffers that were
/// allocated with `allocator`; call `deinit` exactly once to release them.
const RespostaHTTP = struct {
    /// Status code reported by the server.
    status: http.Status,
    /// Response body bytes (owned).
    corpo: []const u8,
    /// Value of the Content-Type header, or null when it was not captured (owned).
    content_type: ?[]const u8,
    /// Allocator that owns `corpo` and `content_type`.
    allocator: Allocator,

    /// Releases every buffer owned by this response.
    pub fn deinit(self: *RespostaHTTP) void {
        const dono = self.allocator;
        if (self.content_type) |tipo| {
            dono.free(tipo);
        }
        dono.free(self.corpo);
    }
};

/// Failure categories the scraper can report.
const ScraperError = error{
    /// The connection could not be established.
    /// Declared for callers; not returned by the code in this section.
    ConexaoFalhou,
    /// The document could not be interpreted as HTML.
    /// Declared for callers; not returned by the code in this section.
    HTMLInvalido,
    /// The response body exceeded the accepted size.
    /// Declared for callers; not returned by the code in this section.
    RespostaMuitoGrande,
    /// The operation took too long.
    /// Declared for callers; not returned by the code in this section.
    TimeoutExcedido,
    /// The URL text could not be parsed into a URI.
    URLInvalida,
};

/// Performs an HTTP GET on `url` and returns the status, body and content type.
///
/// Written against the `std.http.Client` API of Zig 0.13/0.14: request
/// options carry the header buffer and extra headers, and the status lives
/// in `req.response`.
///
/// The caller owns the returned `RespostaHTTP` and must call `deinit` on it.
///
/// Errors:
/// - `ScraperError.URLInvalida` when `url` cannot be parsed.
/// - Any connection, TLS or protocol error from `std.http.Client`.
/// - `error.StreamTooLong` when the body exceeds 1 MiB.
/// - `error.OutOfMemory`.
fn fazerRequisicao(allocator: Allocator, url: []const u8) !RespostaHTTP {
    var client = http.Client{ .allocator = allocator };
    defer client.deinit();

    const uri = Uri.parse(url) catch return ScraperError.URLInvalida;

    // The client parses response headers into this caller-provided buffer;
    // slices in `req.response` (e.g. `content_type`) point into it.
    var header_buf: [16 * 1024]u8 = undefined;

    var req = try client.open(.GET, uri, .{
        .server_header_buffer = &header_buf,
        .headers = .{ .user_agent = .{ .override = "ZigScraper/1.0" } },
        .extra_headers = &.{
            .{ .name = "Accept", .value = "text/html,text/plain" },
        },
    });
    defer req.deinit();

    try req.send();
    try req.finish();
    try req.wait();

    // Read the body, capped at 1 MiB.
    const max_size = 1024 * 1024;
    const corpo = try req.reader().readAllAlloc(allocator, max_size);
    errdefer allocator.free(corpo);

    // Copy the header value out of `header_buf`, which dies with this frame.
    const content_type: ?[]const u8 = if (req.response.content_type) |ct|
        try allocator.dupe(u8, ct)
    else
        null;

    return RespostaHTTP{
        .status = req.response.status,
        .corpo = corpo,
        .content_type = content_type,
        .allocator = allocator,
    };
}

Passo 3: Parser HTML Simplificado

Implementamos um parser HTML minimalista que extrai tags específicas. Não é um parser HTML completo, mas é suficiente para scraping básico.

/// One element extracted from an HTML document.
///
/// Every slice stored here is a view into memory owned by someone else
/// (the tag name passed by the caller and the HTML buffer being scanned),
/// so an `ElementoHTML` must not outlive those buffers.
const ElementoHTML = struct {
    /// Tag name that was searched for.
    tag: []const u8,
    /// Raw text between the opening and closing tag ("" when none was found).
    conteudo: []const u8,
    /// Fixed-capacity attribute storage; only the first `num_atributos` entries are valid.
    atributos: [8]Atributo,
    /// Number of valid entries in `atributos`.
    num_atributos: usize,

    /// A single `name="value"` pair from the opening tag.
    const Atributo = struct {
        nome: []const u8,
        valor: []const u8,
    };

    /// Returns the value of the first attribute called `nome`, or null.
    /// HTML attribute names are case-insensitive, so the comparison ignores
    /// ASCII case (`HREF` matches `href`).
    pub fn atributo(self: *const ElementoHTML, nome: []const u8) ?[]const u8 {
        for (self.atributos[0..self.num_atributos]) |attr| {
            if (std.ascii.eqlIgnoreCase(attr.nome, nome)) return attr.valor;
        }
        return null;
    }
};

/// Collects occurrences of `tag` in `html` into `resultados`.
///
/// For each opening `<tag ...>` the attributes are parsed and the text up to
/// the next `</tag>` becomes `conteudo`. Self-closing tags (`<tag ... />`)
/// have no content, so `conteudo` stays empty for them, as it does when no
/// closing tag is found. This is a linear text scan, not a DOM parser:
/// nesting of the same tag is not tracked.
///
/// Returns the number of entries written, at most `resultados.len`.
/// All slices in the results point into `html` (or `tag`).
fn encontrarTags(
    html: []const u8,
    tag: []const u8,
    resultados: []ElementoHTML,
) usize {
    var count: usize = 0;
    var pos: usize = 0;

    while (pos < html.len and count < resultados.len) {
        // Locate the next opening tag, relative to `pos`.
        const tag_abertura = buscarTag(html[pos..], tag) orelse break;
        const inicio_abs = pos + tag_abertura.inicio;
        const fim_tag = pos + tag_abertura.fim;

        var elem = ElementoHTML{
            .tag = tag,
            .conteudo = "",
            .atributos = undefined,
            .num_atributos = 0,
        };

        extrairAtributos(html[inicio_abs..fim_tag], &elem);

        // `<tag ... />` closes itself: do not steal text that belongs to a
        // later, unrelated closing tag.
        const auto_fechada = fim_tag - inicio_abs >= 2 and html[fim_tag - 2] == '/';

        if (!auto_fechada and fim_tag < html.len) {
            if (encontrarFechamento(html[fim_tag..], tag)) |fim_conteudo| {
                elem.conteudo = html[fim_tag .. fim_tag + fim_conteudo];
            }
        }

        resultados[count] = elem;
        count += 1;
        // Resume right after the opening tag.
        pos = fim_tag;
    }

    return count;
}

/// Byte range of an opening tag inside the searched text:
/// `inicio` is the index of `<`, `fim` is one past the matching `>`.
const PosicaoTag = struct { inicio: usize, fim: usize };

/// Finds the first opening tag named `tag` in `html`.
///
/// The name is compared ignoring ASCII case (consistent with the closing-tag
/// search) and must be followed by whitespace, `>` or `/`, so searching for
/// `a` does not match `<abbr>`. Returns null when no complete opening tag
/// (one that has a `>`) exists.
fn buscarTag(html: []const u8, tag: []const u8) ?PosicaoTag {
    var inicio: usize = 0;
    while (mem.indexOfScalarPos(u8, html, inicio, '<')) |i| {
        inicio = i + 1;

        const resto = html[i + 1 ..];
        // Not enough text left for the name; later candidates have even less.
        if (resto.len < tag.len) return null;
        if (!std.ascii.eqlIgnoreCase(resto[0..tag.len], tag)) continue;

        // The name must end here, otherwise this is a longer tag name.
        if (resto.len > tag.len) {
            const c = resto[tag.len];
            if (c != '>' and c != '/' and !std.ascii.isWhitespace(c)) continue;
        }

        // Without a '>' after this point no later candidate can be complete.
        const fim = mem.indexOfScalarPos(u8, html, i, '>') orelse return null;
        return PosicaoTag{ .inicio = i, .fim = fim + 1 };
    }
    return null;
}

/// Returns the offset in `html` of the first `</tag>`, or null when absent.
///
/// The tag name is compared ignoring ASCII case. The search works directly
/// on `html`, so there is no limit on the length of `tag`.
fn encontrarFechamento(html: []const u8, tag: []const u8) ?usize {
    var pos: usize = 0;
    while (mem.indexOfPos(u8, html, pos, "</")) |i| {
        const nome_ini = i + 2;
        const nome_fim = nome_ini + tag.len;
        // No room left for the name plus '>'; later candidates have even less.
        if (nome_fim >= html.len) return null;

        if (html[nome_fim] == '>' and
            std.ascii.eqlIgnoreCase(html[nome_ini..nome_fim], tag))
        {
            return i;
        }
        pos = nome_ini;
    }
    return null;
}

/// Parses the attributes of an opening tag into `elem`.
///
/// `tag_html` is the text of the opening tag, from `<` up to and including
/// `>`. Supported forms: `name="value"`, `name='value'` and unquoted
/// `name=value`, with any ASCII whitespace (space, tab, newline) between
/// tokens. Attributes without a value (e.g. `disabled`) are skipped.
/// Parsing stops when `elem.atributos` is full. Stored names and values are
/// slices of `tag_html`.
fn extrairAtributos(tag_html: []const u8, elem: *ElementoHTML) void {
    // Attributes start after the first whitespace following the tag name.
    const inicio_attrs = mem.indexOfAny(u8, tag_html, &std.ascii.whitespace) orelse return;
    var pos = inicio_attrs + 1;

    while (elem.num_atributos < elem.atributos.len) {
        while (pos < tag_html.len and std.ascii.isWhitespace(tag_html[pos])) pos += 1;
        if (pos >= tag_html.len or tag_html[pos] == '>' or tag_html[pos] == '/') break;

        // Attribute name: runs until '=', whitespace, '/' or '>'.
        const inicio_nome = pos;
        while (pos < tag_html.len) : (pos += 1) {
            const c = tag_html[pos];
            if (c == '=' or c == '>' or c == '/' or std.ascii.isWhitespace(c)) break;
        }
        const nome = tag_html[inicio_nome..pos];

        // Whitespace is allowed around '='.
        while (pos < tag_html.len and std.ascii.isWhitespace(tag_html[pos])) pos += 1;
        // No '=': valueless attribute, nothing to store.
        if (pos >= tag_html.len or tag_html[pos] != '=') continue;
        pos += 1;
        while (pos < tag_html.len and std.ascii.isWhitespace(tag_html[pos])) pos += 1;
        if (pos >= tag_html.len) break;

        const primeiro = tag_html[pos];
        const valor = blk: {
            if (primeiro == '"' or primeiro == '\'') {
                // Quoted value: up to the matching quote, or end of text
                // when the quote is never closed.
                const ini_aspas = pos + 1;
                const fim_aspas = mem.indexOfScalarPos(u8, tag_html, ini_aspas, primeiro) orelse tag_html.len;
                pos = @min(fim_aspas + 1, tag_html.len);
                break :blk tag_html[ini_aspas..fim_aspas];
            }
            // Unquoted value: up to whitespace or '>'.
            const ini_simples = pos;
            while (pos < tag_html.len and tag_html[pos] != '>' and
                !std.ascii.isWhitespace(tag_html[pos])) pos += 1;
            break :blk tag_html[ini_simples..pos];
        };

        // A stray '=' with no name in front of it is not an attribute.
        if (nome.len == 0) continue;

        elem.atributos[elem.num_atributos] = .{ .nome = nome, .valor = valor };
        elem.num_atributos += 1;
    }
}

Passo 4: Extração de Dados

/// Data gathered from one scraped page.
///
/// The title is stored inline (copied into `titulo`); the link slices are
/// views into the HTML buffer they were extracted from and are only valid
/// while that buffer is alive.
const ResultadoScraping = struct {
    /// One anchor found in the page.
    const Link = struct {
        /// Value of the `href` attribute.
        href: []const u8,
        /// Raw text between `<a ...>` and `</a>`.
        texto: []const u8,
    };

    /// Title bytes; only the first `titulo_len` are meaningful.
    titulo: [256]u8,
    /// Number of valid bytes in `titulo` (0 when no title was found).
    titulo_len: usize,
    /// Link storage; only the first `num_links` entries are meaningful.
    links: [100]Link,
    /// Number of valid entries in `links`.
    num_links: usize,

    /// Returns the valid part of the title buffer as a slice.
    pub fn tituloStr(self: *const ResultadoScraping) []const u8 {
        const fim = self.titulo_len;
        return self.titulo[0..fim];
    }
};

/// Extracts the page title and the `<a href>` links from `html`.
///
/// The title is trimmed of surrounding whitespace and copied into the
/// result. When it does not fit, it is cut at a UTF-8 character boundary so
/// the stored bytes stay valid text. Link `href`/`texto` slices point into
/// `html`, so the result must not outlive that buffer. Only the first 100
/// `<a>` tags are inspected; anchors without `href` are skipped.
fn extrairDados(html: []const u8) ResultadoScraping {
    var resultado = ResultadoScraping{
        .titulo = undefined,
        .titulo_len = 0,
        .links = undefined,
        .num_links = 0,
    };

    // Title: first <title> element, if any.
    var titulos: [1]ElementoHTML = undefined;
    if (encontrarTags(html, "title", &titulos) > 0) {
        const conteudo = mem.trim(u8, titulos[0].conteudo, " \t\r\n");
        var len: usize = @min(conteudo.len, resultado.titulo.len);
        if (len < conteudo.len) {
            // Truncating: back up while the cut lands on a UTF-8
            // continuation byte (0b10xxxxxx), i.e. inside a character.
            while (len > 0 and (conteudo[len] & 0xC0) == 0x80) len -= 1;
        }
        @memcpy(resultado.titulo[0..len], conteudo[0..len]);
        resultado.titulo_len = len;
    }

    // Links: every <a> that carries an href attribute.
    var links_html: [100]ElementoHTML = undefined;
    const num_links = encontrarTags(html, "a", &links_html);

    for (links_html[0..num_links]) |*elem| {
        if (resultado.num_links >= resultado.links.len) break;
        const href = elem.atributo("href") orelse continue;
        resultado.links[resultado.num_links] = .{
            .href = href,
            .texto = elem.conteudo,
        };
        resultado.num_links += 1;
    }

    return resultado;
}

/// Copies the text of `html` that lies outside `<...>` markup into `buf`.
///
/// Returns the filled prefix of `buf`. Output that does not fit in `buf` is
/// silently dropped. Entities are not decoded and the `<` / `>` characters
/// themselves are never copied.
fn stripHTML(html: []const u8, buf: []u8) []const u8 {
    var escritos: usize = 0;
    var em_tag = false;

    var i: usize = 0;
    while (i < html.len) : (i += 1) {
        const byte = html[i];
        switch (byte) {
            '<' => em_tag = true,
            '>' => em_tag = false,
            else => {
                // Skip markup, and anything past the end of the buffer.
                if (em_tag or escritos >= buf.len) continue;
                buf[escritos] = byte;
                escritos += 1;
            },
        }
    }

    return buf[0..escritos];
}

Passo 5: Interface CLI e Saída

/// Writes a formatted report of `resultado` to `writer`.
///
/// Prints a banner with `url`, the title (or a placeholder when none was
/// found), the total link count and up to 20 links. Each link's text is cut
/// to 50 bytes, and a trailing line reports how many links were not shown.
/// Any error from `writer` is propagated.
fn exibirResultados(resultado: *const ResultadoScraping, url: []const u8, writer: anytype) !void {
    const limite_links = 20;
    const limite_texto = 50;

    try writer.print(
        \\
        \\  ==========================================
        \\     RESULTADO DO SCRAPING
        \\  ==========================================
        \\  URL: {s}
        \\
    , .{url});

    if (resultado.titulo_len == 0) {
        try writer.print("  Titulo: (nao encontrado)\n", .{});
    } else {
        try writer.print("  Titulo: {s}\n", .{resultado.tituloStr()});
    }

    const total = resultado.num_links;
    try writer.print("\n  Links encontrados: {d}\n", .{total});

    const exibidos = @min(total, limite_links);
    for (resultado.links[0..exibidos], 1..) |link, numero| {
        try writer.print("    [{d:>2}] {s}\n", .{ numero, link.href });
        if (link.texto.len == 0) continue;
        const trecho = link.texto[0..@min(link.texto.len, limite_texto)];
        try writer.print("         \"{s}\"\n", .{trecho});
    }

    if (total > exibidos) {
        try writer.print("    ... e mais {d} links\n", .{total - exibidos});
    }
}

/// Interactive entry point: shows a menu, reads an option and a URL from
/// stdin, fetches the page and prints either the stripped text (option 3)
/// or the extracted title and links (options 1 and 2). Option 4 or end of
/// input exits.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const stdout = std.io.getStdOut().writer();
    const stdin = std.io.getStdIn().reader();

    try stdout.print(
        \\
        \\  ==========================================
        \\     WEB SCRAPER - Zig
        \\  ==========================================
        \\
    , .{});

    // The option and the URL get separate buffers: `opcao` is a slice into
    // its buffer and is still consulted after the URL has been read, so
    // sharing one buffer would overwrite it.
    var opcao_buf: [1024]u8 = undefined;
    var url_buf: [1024]u8 = undefined;

    while (true) {
        try stdout.print(
            \\
            \\  [1] Scrape de URL
            \\  [2] Extrair links
            \\  [3] Extrair texto (strip HTML)
            \\  [4] Sair
            \\
            \\  Opcao:
        , .{});

        // Read error: ask again. End of input: leave the loop.
        const opcao_raw = stdin.readUntilDelimiterOrEof(&opcao_buf, '\n') catch continue orelse break;
        const opcao = mem.trim(u8, opcao_raw, " \t\r\n");

        if (mem.eql(u8, opcao, "4")) break;

        if (mem.eql(u8, opcao, "1") or mem.eql(u8, opcao, "2") or mem.eql(u8, opcao, "3")) {
            try stdout.print("\n  URL: ", .{});
            const url_raw = stdin.readUntilDelimiterOrEof(&url_buf, '\n') catch continue orelse continue;
            const url = mem.trim(u8, url_raw, " \t\r\n");

            try stdout.print("  Buscando {s}...\n", .{url});

            var resposta = fazerRequisicao(allocator, url) catch |err| {
                try stdout.print("  Erro ao buscar URL: {any}\n", .{err});
                continue;
            };
            // Runs at the end of this `if` block, i.e. once per request.
            defer resposta.deinit();

            try stdout.print("  Status: {d}\n", .{@intFromEnum(resposta.status)});
            try stdout.print("  Tamanho: {d} bytes\n", .{resposta.corpo.len});

            if (mem.eql(u8, opcao, "3")) {
                var strip_buf: [4096]u8 = undefined;
                const texto = stripHTML(resposta.corpo, &strip_buf);
                try stdout.print("\n  --- Texto ---\n{s}\n", .{texto});
            } else {
                // `resultado` borrows from `resposta.corpo`; it is only used
                // before the deferred `deinit` above runs.
                const resultado = extrairDados(resposta.corpo);
                try exibirResultados(&resultado, url, stdout);
            }
        } else {
            try stdout.print("  Opcao invalida.\n", .{});
        }
    }

    try stdout.print("\n  Ate logo!\n", .{});
}

Testes

// A single <title> element yields one result whose content is the inner text.
test "buscar tag simples" {
    const testing = std.testing;
    var encontrados: [1]ElementoHTML = undefined;
    const total = encontrarTags("<title>Teste</title>", "title", &encontrados);
    try testing.expectEqual(@as(usize, 1), total);
    try testing.expectEqualStrings("Teste", encontrados[0].conteudo);
}

// The href of an anchor is exposed through ElementoHTML.atributo.
test "extrair atributo href" {
    const testing = std.testing;
    var encontrados: [1]ElementoHTML = undefined;
    const total = encontrarTags("<a href=\"https://zig.dev\">Link</a>", "a", &encontrados);
    try testing.expectEqual(@as(usize, 1), total);
    const href = encontrados[0].atributo("href").?;
    try testing.expectEqualStrings("https://zig.dev", href);
}

// Markup is removed and the text between tags is kept in order.
test "strip HTML" {
    var saida: [256]u8 = undefined;
    const limpo = stripHTML("<p>Hello <b>World</b></p>", &saida);
    try std.testing.expectEqualStrings("Hello World", limpo);
}

// Two consecutive anchors are both found, each with its own href and text.
// Checking the extracted values (not only the count) catches a scan that
// resumes at the wrong offset after the first tag.
test "multiplos links" {
    const html = "<a href=\"/a\">A</a><a href=\"/b\">B</a>";
    var resultados: [10]ElementoHTML = undefined;
    const n = encontrarTags(html, "a", &resultados);
    try std.testing.expectEqual(@as(usize, 2), n);

    try std.testing.expectEqualStrings("/a", resultados[0].atributo("href").?);
    try std.testing.expectEqualStrings("A", resultados[0].conteudo);
    try std.testing.expectEqualStrings("/b", resultados[1].atributo("href").?);
    try std.testing.expectEqualStrings("B", resultados[1].conteudo);
}

Compilando e Executando

zig build test
zig build run

Conceitos Aprendidos

Cliente HTTP com std.http.Client
Parsing de HTML sem dependências externas
Extração de atributos de tags
Gerenciamento de memória com allocator
Tratamento de erros de rede

Próximos Passos

Explore a documentação HTTP da stdlib
Aprenda sobre alocadores para gerenciamento de memória avançado
Construa o próximo projeto: Parser de Configuração JSON

Web Scraper em Zig — Tutorial Passo a Passo

Web Scraper em Zig — Tutorial Passo a Passo

O Que Vamos Construir

Pré-requisitos

Passo 1: Estrutura do Projeto

Passo 2: Cliente HTTP Básico

Passo 3: Parser HTML Simplificado

Passo 4: Extração de Dados

Passo 5: Interface CLI e Saída

Testes

Compilando e Executando

Conceitos Aprendidos

Próximos Passos

Explore Mais

Continue aprendendo Zig

Web Scraper em Zig — Tutorial Passo a Passo

O Que Vamos Construir

Pré-requisitos

Passo 1: Estrutura do Projeto

Passo 2: Cliente HTTP Básico

Passo 3: Parser HTML Simplificado

Passo 4: Extração de Dados

Passo 5: Interface CLI e Saída

Testes

Compilando e Executando

Conceitos Aprendidos

Próximos Passos

Explore Mais

Artigos relacionados

Servidor HTTP de Arquivos em Zig — Tutorial Passo a Passo

URL Shortener em Zig — Tutorial Passo a Passo

Agendador de Tarefas em Zig — Tutorial Passo a Passo

Continue aprendendo Zig