네이버 카페 크롤링 자동화, 키워드 수집부터 알림까지 전체 코드 공개

네이버 카페에서 원하는 키워드가 포함된 게시글만 자동으로 수집하고 싶다면 Selenium과 BeautifulSoup를 함께 활용하는 방법이 가장 효과적이에요. 특히 네이버 카페는 동적 페이지 구조와 로그인 필수 환경이라는 특성 때문에 일반적인 크롤링 방법으로는 접근이 어려워요.

macOS에서 환경 설정하기

먼저 macOS 터미널에서 필요한 라이브러리를 설치해요. Chrome 브라우저가 이미 설치되어 있다고 가정하고 진행할게요.

pip install selenium beautifulsoup4 lxml chromedriver-autoinstaller

ChromeDriver를 자동으로 관리해주는 chromedriver-autoinstaller를 사용하면 버전 호환성 문제를 쉽게 해결할 수 있어요.

기본 크롤링 코드 구현

네이버 로그인부터 카페 게시글 수집까지 전체 프로세스를 구현한 코드예요.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import chromedriver_autoinstaller
import time

# Install a matching ChromeDriver through chromedriver-autoinstaller before
# any browser session is started.
chromedriver_autoinstaller.install()

# Build the Chrome launch options that the driver below is started with.
options = webdriver.ChromeOptions()
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

# Start the single module-level browser session; the functions in this file
# reference it through the global name `driver`.
driver = webdriver.Chrome(options=options)
# Redefine navigator.webdriver in the document that is loaded right now.
# NOTE(review): this script runs exactly once here; presumably it does not
# carry over to pages loaded later with driver.get() - confirm.
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

# Naver login
def naver_login(user_id, user_pw, timeout=10):
    """Log in to Naver through the shared module-level ``driver``.

    Opens the Naver login page, types the credentials into the ``id`` and
    ``pw`` fields and clicks the ``log.login`` button.

    Args:
        user_id: Account ID typed into the ``id`` field.
        user_pw: Password typed into the ``pw`` field.
        timeout: Maximum number of seconds to wait for each form control
            to become clickable.

    Raises:
        selenium.common.exceptions.TimeoutException: If a form control does
            not become clickable within ``timeout`` seconds.
    """
    driver.get('https://nid.naver.com/nidlogin.login')

    # Wait for each control explicitly instead of sleeping a fixed second:
    # a slow page load no longer makes the element lookup fail, and a fast
    # one no longer wastes time.
    wait = WebDriverWait(driver, timeout)

    # ID field
    id_input = wait.until(EC.element_to_be_clickable((By.ID, 'id')))
    id_input.click()
    id_input.send_keys(user_id)

    # Password field
    pw_input = wait.until(EC.element_to_be_clickable((By.ID, 'pw')))
    pw_input.click()
    pw_input.send_keys(user_pw)

    # Submit the form
    wait.until(EC.element_to_be_clickable((By.ID, 'log.login'))).click()

    # A fixed pause is kept here on purpose: the page shown after a login
    # attempt is not known from this code, so there is no element to wait on.
    time.sleep(3)

키워드 필터링 크롤러 구현

카페 내에서 특정 키워드가 포함된 게시글만 수집하는 핵심 코드예요. 검색 기능을 활용해 1차 필터링을 하고, BeautifulSoup로 2차 필터링을 적용해요.

def cafe_keyword_crawler(cafe_url, search_keyword, filter_keywords, max_content_fetch=10):
    """Search a cafe and return the posts whose title contains a keyword.

    The cafe's own search is used as a first filter (exact phrase search for
    ``search_keyword``); the result list is then filtered again on the title
    text. For the first ``max_content_fetch`` matches the post page is opened
    and the first 200 characters of its body are stored as well.

    Args:
        cafe_url: URL of the cafe to open.
        search_keyword: Phrase typed (in double quotes) into the search box.
        filter_keywords: Iterable of keywords; a post is kept when its title
            contains at least one of them, case-insensitively.
        max_content_fetch: Maximum number of post pages opened to read the
            body text. Caps the number of extra requests per call.

    Returns:
        A list of dicts with the keys ``title`` and ``link`` and, when the
        body was fetched and found, ``content``.
    """
    # Local import so this block does not depend on a file-level import.
    from urllib.parse import urljoin

    base_url = 'https://cafe.naver.com'
    # Lower-case the keywords once instead of once per article.
    lowered_keywords = [keyword.lower() for keyword in filter_keywords]
    results = []

    # Open the cafe
    driver.get(cafe_url)
    time.sleep(2)

    # The cafe content lives inside the 'cafe_main' iframe
    driver.switch_to.frame('cafe_main')

    # Type the search phrase; double quotes request an exact-phrase search
    search_input = driver.find_element(By.NAME, 'query')
    search_input.clear()
    search_input.send_keys(f'"{search_keyword}"')
    search_input.submit()
    time.sleep(2)

    # Parse the result list once; the parsed tree stays valid even after the
    # driver navigates to individual posts below.
    soup = BeautifulSoup(driver.page_source, 'lxml')

    fetched = 0
    for article in soup.select('a.article'):
        href = article.get('href')
        if not href:
            # An anchor without href has no usable link (and the link is the
            # identity of a post for callers), so skip it.
            continue

        title = article.get_text(strip=True)
        lowered_title = title.lower()
        if not any(keyword in lowered_title for keyword in lowered_keywords):
            continue

        # urljoin leaves absolute hrefs intact and resolves relative ones
        # against the cafe host; plain concatenation mangled absolute URLs.
        link = urljoin(base_url, href)
        post = {'title': title, 'link': link}
        results.append(post)

        # Count the fetches themselves so that exactly max_content_fetch
        # bodies are read (checking len(results) after the append stopped
        # one short of the limit).
        if fetched >= max_content_fetch:
            continue
        fetched += 1

        driver.get(link)
        time.sleep(1)
        driver.switch_to.frame('cafe_main')

        content_soup = BeautifulSoup(driver.page_source, 'lxml')
        content = content_soup.select_one('.se-main-container')
        if content:
            post['content'] = content.get_text(strip=True)[:200]

    return results

실시간 모니터링 시스템 구축

주기적으로 새 글을 확인하고 알림을 보내는 실시간 크롤링 시스템이에요. macOS에서는 터미널의 osascript 명령으로 알림 센터에 알림을 띄울 수 있어요.

import subprocess
from datetime import datetime
import json

def _applescript_string(text):
    """Return *text* as a double-quoted AppleScript string literal."""
    # Backslashes must be escaped first so the escapes added for the quotes
    # are not escaped a second time.
    escaped = str(text).replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'


def send_mac_notification(title, message):
    """Show a macOS notification by running AppleScript through ``osascript``.

    ``title`` and ``message`` are escaped before being placed in the script,
    so text containing double quotes or backslashes (for example a post
    title scraped from the web) can neither break the script nor inject
    AppleScript code.

    Args:
        title: Notification title.
        message: Notification body text.
    """
    script = (
        f'display notification {_applescript_string(message)} '
        f'with title {_applescript_string(title)}'
    )
    # Best effort, as before: a failing osascript call is not treated as an error.
    subprocess.run(['osascript', '-e', script], check=False)

def realtime_monitoring(cafe_url, keywords, interval=300):
    """Poll a cafe for new keyword-matching posts and notify about each one.

    Runs until the process is interrupted. Every cycle searches the cafe once
    per keyword, sends a macOS notification for each post whose link has not
    been seen before, appends those posts to ``new_posts.json`` (one JSON
    object per line) and rewrites ``seen_posts.json`` with every seen link.
    Any exception inside a cycle is printed and the cycle is retried after
    60 seconds.

    Args:
        cafe_url: URL of the cafe to monitor.
        keywords: Keywords used both as search phrases and as title filters.
        interval: Seconds to sleep between two cycles (default: 300).
    """
    seen_posts = set()

    # Load the links seen by earlier runs
    try:
        with open('seen_posts.json', 'r', encoding='utf-8') as f:
            seen_posts = set(json.load(f))
    except FileNotFoundError:
        pass
    except (json.JSONDecodeError, TypeError) as e:
        # A truncated or malformed state file should not stop the monitor;
        # start with an empty history and say so.
        print(f"seen_posts.json 읽기 실패, 빈 기록으로 시작합니다: {e}")

    while True:
        try:
            print(f"[{datetime.now()}] 크롤링 시작")

            # Search once per keyword
            new_posts = []
            new_ids = set()
            for keyword in keywords:
                for post in cafe_keyword_crawler(cafe_url, keyword, keywords):
                    post_id = post['link']
                    if post_id in seen_posts or post_id in new_ids:
                        continue
                    new_ids.add(post_id)
                    new_posts.append(post)

            # Notify about and store the new posts
            if new_posts:
                for post in new_posts:
                    send_mac_notification(
                        "새 글 발견!",
                        f"{post['title'][:50]}..."
                    )
                    print(f"새 글: {post['title']}")

                with open('new_posts.json', 'a', encoding='utf-8') as f:
                    for post in new_posts:
                        json.dump(post, f, ensure_ascii=False)
                        f.write('\n')

            # Mark links as seen only after they were notified and stored.
            # Marking them while crawling meant that an error in a later
            # keyword's crawl discarded the pending posts for good.
            seen_posts.update(new_ids)

            # Persist the seen links
            with open('seen_posts.json', 'w', encoding='utf-8') as f:
                json.dump(list(seen_posts), f)

            print(f"[{datetime.now()}] 대기 중...")
            time.sleep(interval)

        except Exception as e:
            # Top-level boundary of a long-running loop: report and retry.
            print(f"에러 발생: {e}")
            time.sleep(60)

성능 최적화 전략

크롤링 속도와 효율성을 높이는 몇 가지 중요한 최적화 방법이에요. 아래 Chrome 옵션은 webdriver.Chrome(options=options)로 드라이버를 만들기 전에 추가해야 적용돼요.

# Run Chrome in headless mode.
# NOTE(review): these lines mutate `options` after `driver` was already
# created from it earlier in this file; presumably they have to run before
# webdriver.Chrome(options=options) to have any effect - confirm.
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Block image loading through a Chrome preference.
# (presumably the value 2 means "block" - TODO confirm)
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)

# Compile the keyword alternation once so every check reuses one pattern.
import re
# NOTE(review): `keywords` is read from module scope and is not defined in the
# visible part of this file; it must exist before this line runs.
# Each keyword is escaped so it matches literally (e.g. "C++"), the same way
# the substring filter in the crawler does. With no keywords the fallback
# pattern `(?!)` never matches, instead of an empty pattern matching everything.
keyword_pattern = re.compile(
    '|'.join(re.escape(keyword) for keyword in keywords) or r'(?!)',
    re.IGNORECASE,
)

def fast_keyword_check(text):
    """Return True if *text* contains any keyword, ignoring case."""
    return bool(keyword_pattern.search(text))

# Crawl several cafes, one after another.
# (The import is kept so that other code relying on it keeps working.)
from concurrent.futures import ThreadPoolExecutor

def multi_cafe_crawler(cafe_urls, keywords):
    """Collect keyword-matching posts from several cafes.

    The cafes are crawled sequentially. ``cafe_keyword_crawler`` drives the
    single module-level browser session, so running it from several threads
    at once would let one thread navigate away while another is still
    reading the page, mixing up the results.

    Args:
        cafe_urls: Iterable of cafe URLs to crawl.
        keywords: Sequence of keywords; the first one is used as the search
            phrase and all of them as title filters.

    Returns:
        One list with the results of every cafe, in the order of
        ``cafe_urls``. Empty when ``keywords`` is empty.
    """
    if not keywords:
        # Nothing can match without keywords; also avoids keywords[0] failing.
        return []

    search_keyword = keywords[0]
    all_results = []
    for cafe_url in cafe_urls:
        all_results.extend(cafe_keyword_crawler(cafe_url, search_keyword, keywords))

    return all_results

자동화 시스템 안정화 기법

네이버 카페에서도 자동화 시스템이 원활히 작동하도록 하는 안정화 방법이에요. 정상적인 사용자 패턴을 모방해 안정성을 높일 수 있어요.

import random

# Sleep for a random amount of time
def random_delay(min_sec=1, max_sec=3):
    """Pause for a uniformly random duration between min_sec and max_sec seconds."""
    pause = random.uniform(min_sec, max_sec)
    time.sleep(pause)

# Candidate User-Agent strings.
user_agents = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36'
]
# random.choice runs once, when this line executes, so a single entry is
# picked and added to `options`; it is not re-picked per request.
# NOTE(review): `options` was already used to create `driver` earlier in this
# file; presumably this must run before the driver is created - confirm.
options.add_argument(f'user-agent={random.choice(user_agents)}')

# 마우스 움직임 시뮬레이션
from selenium.webdriver.common.action_chains import ActionChains

def human_like_behavior(driver):
    """Move the mouse pointer by a random offset, then pause briefly.

    Args:
        driver: WebDriver instance on which the pointer move is performed.
    """
    # Offsets are drawn in the same order as before: horizontal, then vertical.
    dx = random.randint(0, 100)
    dy = random.randint(0, 100)
    ActionChains(driver).move_by_offset(dx, dy).perform()
    random_delay(0.5, 1.5)

개인적으로는 공식 API가 제공되지 않는 상황에서 이런 크롤링 기술이 데이터 수집의 중요한 도구라고 생각해요. 하지만 기술의 발전과 함께 플랫폼들도 더 나은 데이터 접근 방법을 제공해주길 바라요. 그때까지는 이런 방법들을 활용하되, 항상 윤리적이고 합법적인 범위 내에서 사용하는 것이 중요해요.

힙랩

이 블로그 검색