[golang爬虫]instagram 图片扒取

链接一行一个, 放在 links.txt 中 (程序读取的就是这个文件名); 怕爬太快被禁 IP, 没敢搞花活

本来打算js获取(内容不多) go直接爬, 发现页面图片是动态的, 随手记录一下, 没有优化

爬虫

package main

import (
    "bufio"
    "context"
    "fmt"
    "io"
    "log"
    "math"
    "net/http"
    "net/url"
    "os"
    "path/filepath"
    "strings"
    "time"
    "unicode"

    "github.com/chromedp/chromedp"
    "golang.org/x/net/proxy"
)

// readLines opens the file at filePath and returns its content as a slice of
// lines with the line terminators stripped.
//
// If the file cannot be opened, the open error is returned with a nil slice.
// If scanning stops because of a read error, the lines gathered so far are
// returned together with that error.
func readLines(filePath string) ([]string, error) {
	f, openErr := os.Open(filePath)
	if openErr != nil {
		return nil, openErr
	}
	defer f.Close()

	var collected []string
	sc := bufio.NewScanner(f)
	for more := sc.Scan(); more; more = sc.Scan() {
		collected = append(collected, sc.Text())
	}

	// Scanner.Err is nil when the loop ended at a clean end of file.
	return collected, sc.Err()
}

// downloadInsImage fetches imgUrl with client and stores the response body in
// the file folder/filename.
//
// Any response other than 200 OK is reported as an error that includes the
// HTTP status code, and no file is created. If copying the body or closing
// the file fails, the partially written file is removed so that a truncated
// image is never left on disk, and the underlying error is returned.
func downloadInsImage(client *http.Client, imgUrl, folder, filename string) error {
	resp, err := client.Get(imgUrl)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("failed to download image: %s (HTTP %d)", imgUrl, resp.StatusCode)
	}

	target := filepath.Join(folder, filename)
	out, err := os.Create(target)
	if err != nil {
		return err
	}

	// Close explicitly rather than via defer: a failed Close can mean the
	// data never reached the disk, so its error must be reported.
	_, copyErr := io.Copy(out, resp.Body)
	closeErr := out.Close()
	if copyErr == nil && closeErr == nil {
		return nil
	}

	// Best-effort cleanup of the incomplete file; the copy/close error is the
	// one worth reporting, so a failure to remove is deliberately ignored.
	_ = os.Remove(target)

	if copyErr != nil {
		return fmt.Errorf("saving %s to %s: %w", imgUrl, target, copyErr)
	}
	return fmt.Errorf("closing %s: %w", target, closeErr)
}

// getDatas navigates the browser bound to ctx to url, repeatedly clicks the
// "load more" button until it is no longer present in the DOM, and gathers the
// src of every image matched by the gallery selector along the way.
//
// It returns the page title, the de-duplicated image URLs in first-seen order,
// and any error raised by a browser action. On error the title is empty and
// the slice is nil.
//
// Images are harvested on every pass, including the first one before any
// click, so pages that never show a "load more" button still yield the
// images that are already rendered. All waits go through chromedp.Sleep so a
// cancelled or expired ctx interrupts them.
//
// NOTE(review): the CSS class names are taken as-is from the page markup at
// the time of writing; presumably they change when the site is redeployed.
func getDatas(ctx context.Context, url string) (string, []string, error) {
	const (
		loadMoreSelector = `div > button._afxw._al46._al47`
		collectImagesJS  = `Array.from(document.querySelectorAll("._aap0 ._aagv img")).map(img => img.src)`
	)
	buttonExistsJS := `document.querySelector('` + loadMoreSelector + `') !== null`

	var title string
	var imageLinks []string

	err := chromedp.Run(ctx,
		chromedp.Navigate(url),

		// Click the "load more" button until it disappears, collecting image
		// links on every pass.
		chromedp.ActionFunc(func(ctx context.Context) error {
			// Give the page time to render its initial content.
			if err := chromedp.Sleep(3 * time.Second).Do(ctx); err != nil {
				return err
			}
			for {
				if err := chromedp.Sleep(500 * time.Millisecond).Do(ctx); err != nil {
					return err
				}

				// Harvest before checking for the button so the final pass
				// (and a page without any button) is not skipped.
				var imgs []string
				if err := chromedp.Evaluate(collectImagesJS, &imgs).Do(ctx); err != nil {
					return fmt.Errorf("collecting image links: %w", err)
				}
				imageLinks = append(imageLinks, imgs...)

				var buttonExists bool
				if err := chromedp.Evaluate(buttonExistsJS, &buttonExists).Do(ctx); err != nil {
					return fmt.Errorf("probing load-more button: %w", err)
				}
				if !buttonExists {
					return nil
				}

				if err := chromedp.Click(loadMoreSelector, chromedp.NodeVisible).Do(ctx); err != nil {
					return fmt.Errorf("clicking load-more button: %w", err)
				}

				// Short pause so newly requested images can be inserted.
				if err := chromedp.Sleep(250 * time.Millisecond).Do(ctx); err != nil {
					return err
				}
			}
		}),

		// Read the page title.
		chromedp.Title(&title),
	)
	if err != nil {
		return "", nil, err
	}

	uniqueLinks := removeDuplicates(imageLinks)
	fmt.Printf("共发现: %v张图片\n", len(uniqueLinks))
	return title, uniqueLinks, nil
}

// removeDuplicates returns the distinct strings of imageLinks, keeping the
// position of each string's first occurrence. The result is always a non-nil
// slice, even when imageLinks is nil or empty.
func removeDuplicates(imageLinks []string) []string {
	seen := make(map[string]struct{})
	unique := make([]string, 0)

	for _, link := range imageLinks {
		if _, dup := seen[link]; dup {
			continue
		}
		seen[link] = struct{}{}
		unique = append(unique, link)
	}

	return unique
}

// stringToFileName converts input into a string that is safe to use as a file
// or folder name.
//
// Each of the characters \ / : * ? " < > | and every Unicode control character
// is replaced by an underscore. The result is then cut to at most 50
// characters. The limit is counted in runes, not bytes, so a multi-byte
// character (for example CJK text) is never split in the middle, which would
// otherwise produce an invalid UTF-8 name.
func stringToFileName(input string) string {
	const (
		illegalChars = `\/:*?"<>|`
		maxNameRunes = 50 // maximum length of the returned name, in characters
	)

	var output strings.Builder
	for _, char := range input {
		if strings.ContainsRune(illegalChars, char) || unicode.IsControl(char) {
			output.WriteRune('_') // substitute characters that are not allowed
			continue
		}
		output.WriteRune(char)
	}

	// Truncate on rune boundaries. math.Min is used instead of the built-in
	// min because the Go version this file targets is not known.
	runes := []rune(output.String())
	limit := int(math.Min(float64(len(runes)), maxNameRunes))
	return string(runes[:limit])
}

// main reads page links from links.txt (one per line), opens each link in a
// visible Chrome instance driven by chromedp, and downloads every image that
// getDatas discovers into a sub-folder of the output directory named after
// the sanitized page title.
//
// Both the browser and the HTTP client used for downloads go through the same
// local SOCKS5 proxy. Image files are numbered with a single counter that
// runs across all links. A failure on one link or one image is logged and
// processing continues with the next.
func main() {
	const proxyAddr = "socks5://127.0.0.1:10808"
	output := `E:\Scripting\Program-Learning\test\download`

	// Configure the proxy.
	proxyURL, err := url.Parse(proxyAddr)
	if err != nil {
		log.Fatalf("failed to parse proxy URL: %v", err)
	}

	// Build a dialer that tunnels through the proxy.
	dialer, err := proxy.FromURL(proxyURL, proxy.Direct)
	if err != nil {
		log.Fatalf("failed to create proxy dialer: %v", err)
	}

	// Build the HTTP client on top of the dialer. Prefer the context-aware
	// dial hook (Transport.Dial is deprecated); fall back to Dial only when
	// the dialer does not support contexts.
	httpTransport := &http.Transport{}
	if ctxDialer, ok := dialer.(proxy.ContextDialer); ok {
		httpTransport.DialContext = ctxDialer.DialContext
	} else {
		httpTransport.Dial = dialer.Dial
	}
	client := &http.Client{
		Transport: httpTransport,
		// Bound every download so one stalled connection cannot hang the run.
		Timeout: 2 * time.Minute,
	}

	// Read the link file.
	links, err := readLines("links.txt")
	if err != nil {
		log.Fatalf("Error reading links file: %v", err)
	}

	if len(links) == 0 {
		log.Println("No links found in the file.")
		return
	}

	// Create the chromedp allocator and browser context.
	ctx, cancel := chromedp.NewExecAllocator(
		context.Background(),
		append(chromedp.DefaultExecAllocatorOptions[:],
			chromedp.Flag("headless", false), // run with a visible browser window

			chromedp.ProxyServer(proxyAddr),
		)...,
	)
	defer cancel()

	ctx, cancel = chromedp.NewContext(ctx, chromedp.WithLogf(log.Printf))
	defer cancel()

	ctx, cancel = context.WithTimeout(ctx, 30*time.Hour)
	defer cancel()

	index := 1

	// Download the images of every link.
	for _, link := range links {
		// Skip blank lines (e.g. a trailing newline in links.txt) instead of
		// navigating the browser to an empty URL.
		link = strings.TrimSpace(link)
		if link == "" {
			continue
		}

		folderName, imgsUrl, err := getDatas(ctx, link)
		if err != nil {
			log.Printf("Error getting data from HTML: %v", err)
			continue
		}

		folderName = stringToFileName(folderName)
		fmt.Printf("link: %v\n", link)

		targetFolder := filepath.Join(output, folderName)

		// Create the destination folder.
		if err := os.MkdirAll(targetFolder, os.ModePerm); err != nil {
			log.Printf("Error creating folder: %v", err)
			continue
		}

		for _, imgUrl := range imgsUrl {
			filename := fmt.Sprintf("image_%d.jpg", index)
			if err := downloadInsImage(client, imgUrl, targetFolder, filename); err != nil {
				// Include the cause; the index alone is not actionable.
				log.Printf("Error downloading image: %d: %v", index, err)
			} else {
				fmt.Printf("Downloaded %d\n", index)
			}

			index++
		}
	}
}

给TA充电
共{{data.count}}人
人已充电
编程

[npm]自动升级包

2024-7-8 5:30:03

AEAE表达式

[表达式]使用repeat创建动作

2020-7-22 1:12:50

0 条回复 A文章作者 M管理员
    暂无讨论,说说你的看法吧
个人中心
今日签到
搜索