diff --git a/.gitignore b/.gitignore index dedbdfaf2..ccb9b9f10 100644 --- a/.gitignore +++ b/.gitignore @@ -165,6 +165,7 @@ cython_debug/ /temp_image/ /browser_data/ /data/ +/cookies/ */.DS_Store .vscode diff --git a/config/base_config.py b/config/base_config.py index 94c472408..95e6fe1ef 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -43,6 +43,11 @@ # 是否保存登录状态 SAVE_LOGIN_STATE = True +# 是否自动保存和使用Cookie(优先使用保存的Cookie,避免重复扫码登录) +# 设置为True后,成功登录会自动保存Cookie到cookies/目录 +# 下次启动时会先尝试使用保存的Cookie,失败后才会要求重新登录 +AUTO_SAVE_AND_USE_COOKIES = True + # ==================== CDP (Chrome DevTools Protocol) 配置 ==================== # 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力 # 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制 diff --git a/docs/xiaohongshu_auth_improvement.md b/docs/xiaohongshu_auth_improvement.md new file mode 100644 index 000000000..29c89055b --- /dev/null +++ b/docs/xiaohongshu_auth_improvement.md @@ -0,0 +1,314 @@ +# Xiaohongshu (小红书) Authentication Improvement + +## Overview + +The authentication system for Xiaohongshu has been improved to avoid repeated QR code scanning on every run. The system now automatically saves and reuses cookies from successful login sessions. + +## Features + +### 1. Automatic Cookie Management +- **Auto-save**: After a successful QR code or phone login, cookies are automatically saved to `cookies/xhs_cookies.json` +- **Auto-load**: On subsequent runs, saved cookies are automatically loaded and validated +- **Smart fallback**: If saved cookies are invalid or expired, the system falls back to the configured login method + +### 2. Cookie Validation +- Cookies are validated using the `pong()` check before being used +- Invalid or expired cookies trigger a new login flow +- Cookie age warning (30+ days old) + +### 3. 
Multiple Login Methods +The system supports three login methods: +- **QR Code Login** (default): Scan QR code with Xiaohongshu mobile app +- **Phone Login**: SMS verification code +- **Cookie Login**: Use saved or manually provided cookies + +## Configuration + +### Enable/Disable Auto Cookie Management + +In `config/base_config.py`: + +```python +# Enable automatic cookie saving and usage (recommended) +AUTO_SAVE_AND_USE_COOKIES = True # Set to False to disable + +# Choose login method (qrcode, phone, or cookie) +LOGIN_TYPE = "qrcode" + +# Optional: Manually provide cookies (if AUTO_SAVE_AND_USE_COOKIES is False) +COOKIES = "" +``` + +### Configuration Options + +| Option | Values | Description | +|--------|--------|-------------| +| `AUTO_SAVE_AND_USE_COOKIES` | `True`/`False` | Enable automatic cookie management | +| `LOGIN_TYPE` | `"qrcode"`/`"phone"`/`"cookie"` | Primary login method | +| `SAVE_LOGIN_STATE` | `True`/`False` | Save browser session state | +| `COOKIES` | String | Manually provided cookies (optional) | + +## How It Works + +### First Run (No Saved Cookies) + +1. System checks for saved cookies in `cookies/xhs_cookies.json` +2. No cookies found → Prompt for QR code or phone login +3. After successful login → Automatically save cookies +4. Crawling begins + +### Subsequent Runs (With Saved Cookies) + +1. System loads cookies from `cookies/xhs_cookies.json` +2. Validates cookies with `pong()` check +3. **If valid** → Use cookies directly, no QR code needed ✅ +4. **If invalid** → Fall back to configured login method +5. Crawling begins + +### Login Flow Diagram + +``` +Start + ↓ +Check AUTO_SAVE_AND_USE_COOKIES + ↓ +[Enabled] → Load saved cookies → Validate with pong() + ↓ ↓ +[Valid] ✅ [Invalid] ❌ + ↓ ↓ +Use cookies Try configured login method + ↓ ↓ +Skip QR code scan! 
QR/Phone login → Save cookies + ↓ ↓ +Start crawling ←──────────────────────────┘ +``` + +## File Structure + +``` +MediaCrawler/ +├── cookies/ # Cookie storage (gitignored) +│ └── xhs_cookies.json # Saved Xiaohongshu cookies +├── media_platform/xhs/ +│ ├── cookie_manager.py # Cookie management utility +│ ├── login.py # Enhanced login logic +│ └── core.py # Updated authentication flow +└── config/ + └── base_config.py # Configuration options +``` + +## Cookie File Format + +The `cookies/xhs_cookies.json` file contains: + +```json +{ + "cookies": [ + { + "name": "web_session", + "value": "...", + "domain": ".xiaohongshu.com", + "path": "/", + ... + }, + ... + ], + "saved_at": 1234567890.123, + "saved_time": "2025-01-15 10:30:45" +} +``` + +## Security Considerations + +### ⚠️ Important +- **Never commit** `cookies/xhs_cookies.json` to version control +- The `cookies/` directory is automatically ignored in `.gitignore` +- Cookies contain sensitive authentication data +- Treat cookies like passwords + +### Cookie Expiration +- Cookies typically expire after 30 days of inactivity +- The system warns about old cookies (30+ days) +- Expired cookies trigger automatic re-authentication + +## Usage Examples + +### Example 1: Default Setup (Recommended) + +```python +# config/base_config.py +PLATFORM = "xhs" +LOGIN_TYPE = "qrcode" +AUTO_SAVE_AND_USE_COOKIES = True # Enable auto cookie management +``` + +**First run**: Scan QR code once → Cookies saved automatically +**Future runs**: No QR code needed! Cookies loaded automatically ✅ + +### Example 2: Manual Cookie Management + +```python +# config/base_config.py +PLATFORM = "xhs" +LOGIN_TYPE = "cookie" +AUTO_SAVE_AND_USE_COOKIES = False # Disable auto management +COOKIES = "web_session=xxx; a1=yyy; ..." 
# Manually provide cookies +``` + +### Example 3: Phone Login with Auto-Save + +```python +# config/base_config.py +PLATFORM = "xhs" +LOGIN_TYPE = "phone" +AUTO_SAVE_AND_USE_COOKIES = True +``` + +**First run**: Enter phone number and SMS code → Cookies saved +**Future runs**: No SMS code needed! Cookies loaded automatically ✅ + +## Troubleshooting + +### Issue: Cookies Not Being Saved + +**Possible causes:** +1. `AUTO_SAVE_AND_USE_COOKIES = False` in config +2. Login failed before cookie saving +3. File permission issues + +**Solution:** +- Check config: `AUTO_SAVE_AND_USE_COOKIES = True` +- Ensure login completes successfully +- Check write permissions for `cookies/` directory + +### Issue: Cookies Expired or Invalid + +**Symptoms:** +- Log message: "Saved cookies are invalid or expired" +- System falls back to QR code login + +**Solution:** +- This is normal behavior after long inactivity +- Simply login again with QR code +- New cookies will be saved automatically + +### Issue: "Cookie file not found" on First Run + +**This is normal!** +- First run has no saved cookies +- Complete QR code or phone login +- Cookies will be saved for future use + +## Benefits + +### Before This Improvement ❌ +- Scan QR code **every single time** +- Manual cookie management required +- Tedious repeated authentication + +### After This Improvement ✅ +- Scan QR code **only once** +- Automatic cookie persistence +- Seamless authentication on subsequent runs +- No manual intervention needed + +## API Reference + +### CookieManager Class + +Located in `media_platform/xhs/cookie_manager.py` + +#### Methods + +##### `save_cookies(cookies: List[Dict]) -> bool` +Save cookies to file with timestamp + +##### `load_cookies() -> Optional[List[Dict]]` +Load cookies from file, returns None if not found + +##### `clear_cookies() -> bool` +Delete saved cookie file + +##### `get_cookie_info() -> Optional[Dict]` +Get metadata about saved cookies (count, age, etc.) 
+ +### XiaoHongShuLogin Class Enhancements + +#### New Method: `save_cookies_to_file() -> bool` +Automatically called after successful login when `AUTO_SAVE_AND_USE_COOKIES = True` + +#### Enhanced Method: `login_by_cookies()` +Now supports both: +- Loading from saved cookie file (if `cookie_str` is empty) +- Using manually provided `cookie_str` +- Imports **all** cookies, not just `web_session` + +## Migration Guide + +### Upgrading from Previous Version + +No migration needed! The improvement is backward compatible: + +1. Update your code +2. Set `AUTO_SAVE_AND_USE_COOKIES = True` in config +3. On first run, login as usual +4. Subsequent runs will use saved cookies automatically + +### Disabling the Feature + +If you prefer the old behavior: + +```python +# config/base_config.py +AUTO_SAVE_AND_USE_COOKIES = False +``` + +## Technical Details + +### Cookie Storage Location +- Default: `cookies/xhs_cookies.json` +- Configurable via `CookieManager(cookie_dir="custom_path")` + +### Cookie Validation Process +1. Load cookies from file +2. Add cookies to browser context +3. Call `xhs_client.pong()` to test validity +4. Search for keyword "小红书" as validation test +5. Return success/failure + +### Important Cookies +The system saves all cookies, but these are most critical: +- `web_session`: Main session identifier +- `a1`: Authentication token +- `webId`: Device identifier +- `gid`: User group identifier + +## FAQ + +**Q: How long do cookies last?** +A: Typically 30 days from last use, but can vary. + +**Q: Can I use cookies from multiple accounts?** +A: No, only one account at a time. To switch accounts, delete `cookies/xhs_cookies.json` and login with a different account. + +**Q: Are cookies safe to store?** +A: Cookies are stored locally and gitignored by default. Never share or commit them. + +**Q: What happens if I delete the cookie file?** +A: Next run will require login again. New cookies will be saved. 
+ +**Q: Can I copy cookies between machines?** +A: Technically yes, but not recommended due to IP/device fingerprinting. + +## Contributing + +To extend cookie management to other platforms (Douyin, Bilibili, etc.), use the `CookieManager` class as a reference implementation. + +## License + +This enhancement follows the project's NON-COMMERCIAL LEARNING LICENSE 1.1. + +--- + +**Last Updated**: 2025-11-20 +**Version**: 1.0.0 diff --git a/docs/xiaohongshu_auth_improvement_zh.md b/docs/xiaohongshu_auth_improvement_zh.md new file mode 100644 index 000000000..5e2ec2a13 --- /dev/null +++ b/docs/xiaohongshu_auth_improvement_zh.md @@ -0,0 +1,314 @@ +# 小红书认证改进说明 + +## 概述 + +小红书的认证系统已经改进,避免每次运行都需要扫描二维码。系统现在会自动保存和重用成功登录后的Cookie。 + +## 功能特性 + +### 1. 自动Cookie管理 +- **自动保存**:二维码或手机号登录成功后,Cookie会自动保存到 `cookies/xhs_cookies.json` +- **自动加载**:后续运行时,系统会自动加载并验证保存的Cookie +- **智能回退**:如果保存的Cookie无效或过期,系统会回退到配置的登录方式 + +### 2. Cookie验证 +- 使用 `pong()` 检查来验证Cookie是否有效 +- 无效或过期的Cookie会触发新的登录流程 +- Cookie年龄警告(30天以上) + +### 3. 多种登录方式 +系统支持三种登录方式: +- **二维码登录**(默认):使用小红书手机App扫码 +- **手机号登录**:短信验证码 +- **Cookie登录**:使用保存的或手动提供的Cookie + +## 配置 + +### 启用/禁用自动Cookie管理 + +在 `config/base_config.py` 中: + +```python +# 启用自动Cookie保存和使用(推荐) +AUTO_SAVE_AND_USE_COOKIES = True # 设置为False可禁用 + +# 选择登录方式(qrcode、phone或cookie) +LOGIN_TYPE = "qrcode" + +# 可选:手动提供Cookie(如果AUTO_SAVE_AND_USE_COOKIES为False) +COOKIES = "" +``` + +### 配置选项 + +| 选项 | 值 | 说明 | +|------|---|------| +| `AUTO_SAVE_AND_USE_COOKIES` | `True`/`False` | 启用自动Cookie管理 | +| `LOGIN_TYPE` | `"qrcode"`/`"phone"`/`"cookie"` | 主要登录方式 | +| `SAVE_LOGIN_STATE` | `True`/`False` | 保存浏览器会话状态 | +| `COOKIES` | 字符串 | 手动提供的Cookie(可选)| + +## 工作原理 + +### 首次运行(无保存的Cookie) + +1. 系统检查 `cookies/xhs_cookies.json` 中的保存Cookie +2. 未找到Cookie → 提示二维码或手机号登录 +3. 登录成功后 → 自动保存Cookie +4. 开始爬取 + +### 后续运行(有保存的Cookie) + +1. 系统从 `cookies/xhs_cookies.json` 加载Cookie +2. 使用 `pong()` 检查验证Cookie +3. **如果有效** → 直接使用Cookie,无需扫码 ✅ +4. **如果无效** → 回退到配置的登录方式 +5. 
开始爬取 + +### 登录流程图 + +``` +开始 + ↓ +检查 AUTO_SAVE_AND_USE_COOKIES + ↓ +[已启用] → 加载保存的Cookie → 用pong()验证 + ↓ ↓ +[有效] ✅ [无效] ❌ + ↓ ↓ +使用Cookie 尝试配置的登录方式 + ↓ ↓ +跳过扫码! 二维码/手机登录 → 保存Cookie + ↓ ↓ +开始爬取 ←──────────────────────────┘ +``` + +## 文件结构 + +``` +MediaCrawler/ +├── cookies/ # Cookie存储(已加入.gitignore) +│ └── xhs_cookies.json # 保存的小红书Cookie +├── media_platform/xhs/ +│ ├── cookie_manager.py # Cookie管理工具 +│ ├── login.py # 增强的登录逻辑 +│ └── core.py # 更新的认证流程 +└── config/ + └── base_config.py # 配置选项 +``` + +## Cookie文件格式 + +`cookies/xhs_cookies.json` 文件包含: + +```json +{ + "cookies": [ + { + "name": "web_session", + "value": "...", + "domain": ".xiaohongshu.com", + "path": "/", + ... + }, + ... + ], + "saved_at": 1234567890.123, + "saved_time": "2025-01-15 10:30:45" +} +``` + +## 安全注意事项 + +### ⚠️ 重要 +- **切勿提交** `cookies/xhs_cookies.json` 到版本控制 +- `cookies/` 目录已自动添加到 `.gitignore` +- Cookie包含敏感的认证数据 +- 像对待密码一样对待Cookie + +### Cookie过期 +- Cookie通常在30天不活动后过期 +- 系统会对旧Cookie(30+天)发出警告 +- 过期的Cookie会触发自动重新认证 + +## 使用示例 + +### 示例1:默认设置(推荐) + +```python +# config/base_config.py +PLATFORM = "xhs" +LOGIN_TYPE = "qrcode" +AUTO_SAVE_AND_USE_COOKIES = True # 启用自动Cookie管理 +``` + +**首次运行**:扫码一次 → Cookie自动保存 +**后续运行**:无需扫码!Cookie自动加载 ✅ + +### 示例2:手动Cookie管理 + +```python +# config/base_config.py +PLATFORM = "xhs" +LOGIN_TYPE = "cookie" +AUTO_SAVE_AND_USE_COOKIES = False # 禁用自动管理 +COOKIES = "web_session=xxx; a1=yyy; ..." # 手动提供Cookie +``` + +### 示例3:手机号登录并自动保存 + +```python +# config/base_config.py +PLATFORM = "xhs" +LOGIN_TYPE = "phone" +AUTO_SAVE_AND_USE_COOKIES = True +``` + +**首次运行**:输入手机号和验证码 → Cookie保存 +**后续运行**:无需验证码!Cookie自动加载 ✅ + +## 问题排查 + +### 问题:Cookie没有被保存 + +**可能原因:** +1. 配置中 `AUTO_SAVE_AND_USE_COOKIES = False` +2. Cookie保存前登录失败 +3. 
文件权限问题 + +**解决方案:** +- 检查配置:`AUTO_SAVE_AND_USE_COOKIES = True` +- 确保登录成功完成 +- 检查 `cookies/` 目录的写入权限 + +### 问题:Cookie过期或无效 + +**症状:** +- 日志信息:"Saved cookies are invalid or expired" +- 系统回退到二维码登录 + +**解决方案:** +- 这是长时间不活动后的正常行为 +- 只需再次扫码登录 +- 新的Cookie会自动保存 + +### 问题:首次运行时"Cookie file not found" + +**这是正常的!** +- 首次运行没有保存的Cookie +- 完成二维码或手机号登录 +- Cookie会为将来使用而保存 + +## 优势对比 + +### 改进前 ❌ +- **每次**都需要扫码 +- 需要手动管理Cookie +- 重复认证很繁琐 + +### 改进后 ✅ +- **只需扫码一次** +- Cookie自动持久化 +- 后续运行无缝认证 +- 无需手动干预 + +## API参考 + +### CookieManager类 + +位于 `media_platform/xhs/cookie_manager.py` + +#### 方法 + +##### `save_cookies(cookies: List[Dict]) -> bool` +保存Cookie到文件,带时间戳 + +##### `load_cookies() -> Optional[List[Dict]]` +从文件加载Cookie,如果未找到则返回None + +##### `clear_cookies() -> bool` +删除保存的Cookie文件 + +##### `get_cookie_info() -> Optional[Dict]` +获取保存Cookie的元数据(数量、年龄等) + +### XiaoHongShuLogin类增强 + +#### 新方法:`save_cookies_to_file() -> bool` +当 `AUTO_SAVE_AND_USE_COOKIES = True` 时,登录成功后自动调用 + +#### 增强方法:`login_by_cookies()` +现在支持: +- 从保存的Cookie文件加载(如果 `cookie_str` 为空) +- 使用手动提供的 `cookie_str` +- 导入**所有**Cookie,不仅仅是 `web_session` + +## 迁移指南 + +### 从旧版本升级 + +无需迁移!改进是向后兼容的: + +1. 更新代码 +2. 在配置中设置 `AUTO_SAVE_AND_USE_COOKIES = True` +3. 首次运行时正常登录 +4. 后续运行将自动使用保存的Cookie + +### 禁用该功能 + +如果您更喜欢旧的行为: + +```python +# config/base_config.py +AUTO_SAVE_AND_USE_COOKIES = False +``` + +## 技术细节 + +### Cookie存储位置 +- 默认:`cookies/xhs_cookies.json` +- 可通过 `CookieManager(cookie_dir="自定义路径")` 配置 + +### Cookie验证过程 +1. 从文件加载Cookie +2. 将Cookie添加到浏览器上下文 +3. 调用 `xhs_client.pong()` 测试有效性 +4. 搜索关键词"小红书"作为验证测试 +5. 
返回成功/失败 + +### 重要的Cookie +系统保存所有Cookie,但这些最关键: +- `web_session`:主会话标识符 +- `a1`:认证令牌 +- `webId`:设备标识符 +- `gid`:用户组标识符 + +## 常见问题 + +**问:Cookie能保持多久?** +答:通常从最后一次使用起30天,但可能有所不同。 + +**问:我可以使用多个账户的Cookie吗?** +答:不可以,一次只能一个账户。要切换账户,删除 `cookies/xhs_cookies.json` 并使用不同的账户登录。 + +**问:存储Cookie安全吗?** +答:Cookie存储在本地,默认已加入.gitignore。切勿分享或提交它们。 + +**问:如果我删除Cookie文件会怎样?** +答:下次运行将需要再次登录。新的Cookie会被保存。 + +**问:我可以在机器之间复制Cookie吗?** +答:技术上可以,但由于IP/设备指纹识别,不推荐这样做。 + +## 贡献 + +要将Cookie管理扩展到其他平台(抖音、B站等),请使用 `CookieManager` 类作为参考实现。 + +## 许可证 + +此增强遵循项目的非商业学习许可证1.1。 + +--- + +**最后更新**:2025-11-20 +**版本**:1.0.0 diff --git a/media_platform/xhs/cookie_manager.py b/media_platform/xhs/cookie_manager.py new file mode 100644 index 000000000..b89249331 --- /dev/null +++ b/media_platform/xhs/cookie_manager.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2025 relakkes@gmail.com +# +# This file is part of MediaCrawler project. +# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/media_platform/xhs/cookie_manager.py +# GitHub: https://github.com/NanmiCoder +# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1 +# + +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
class CookieManager:
    """Persist and restore Xiaohongshu login cookies as a JSON file on disk.

    The file ``<cookie_dir>/xhs_cookies.json`` holds a JSON object with three
    keys: ``"cookies"`` (the Playwright-format cookie list), ``"saved_at"``
    (epoch seconds from ``time.time()``) and ``"saved_time"`` (local time
    formatted as ``YYYY-MM-DD HH:MM:SS``).

    Every public method is best-effort: failures are logged and reported
    through the return value instead of being raised to the caller.
    """

    # Name of the cookie file created inside ``cookie_dir``.
    COOKIE_FILE_NAME = "xhs_cookies.json"
    # Age (seconds) beyond which loaded cookies produce a staleness warning.
    MAX_COOKIE_AGE_SECONDS = 30 * 24 * 3600

    def __init__(self, cookie_dir: str = "cookies"):
        """Create the storage directory (if needed) and resolve the file path.

        Args:
            cookie_dir: Directory in which the cookie file is stored.
        """
        self.cookie_dir = Path(cookie_dir)
        self.cookie_dir.mkdir(parents=True, exist_ok=True)
        self.cookie_file = self.cookie_dir / self.COOKIE_FILE_NAME

    def _read_cookie_data(self) -> Dict:
        """Parse the cookie file and return its top-level JSON object.

        Raises:
            FileNotFoundError: The cookie file does not exist.
            OSError: The file could not be read.
            ValueError: The content is not valid UTF-8 JSON, or the top-level
                value is not a JSON object.
        """
        with open(self.cookie_file, "r", encoding="utf-8") as f:
            cookie_data = json.load(f)
        if not isinstance(cookie_data, dict):
            raise ValueError(
                f"expected a JSON object, got {type(cookie_data).__name__}"
            )
        return cookie_data

    def save_cookies(self, cookies: List[Dict]) -> bool:
        """Write ``cookies`` plus a save timestamp to the cookie file.

        The data is written to a sibling temporary file created with
        owner-only permissions and then moved over the real file, so an
        interrupted write never leaves a truncated cookie file behind and the
        session credentials are not readable by other local users.

        Args:
            cookies: Playwright-format cookie list.

        Returns:
            True if the file was written, False otherwise.
        """
        tmp_file = self.cookie_file.with_name(f"{self.cookie_file.name}.tmp")
        try:
            cookie_count = len(cookies)
            cookie_data = {
                "cookies": cookies,
                "saved_at": time.time(),
                "saved_time": time.strftime("%Y-%m-%d %H:%M:%S"),
            }
            # The directory may have been removed since construction.
            self.cookie_dir.mkdir(parents=True, exist_ok=True)
            fd = os.open(
                tmp_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600
            )
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                json.dump(cookie_data, f, ensure_ascii=False, indent=2)
            os.replace(tmp_file, self.cookie_file)
        except (OSError, TypeError, ValueError) as e:
            # Best-effort cleanup; the temp file may never have been created.
            try:
                tmp_file.unlink()
            except OSError:
                pass
            utils.logger.error(
                f"[CookieManager.save_cookies] Failed to save cookies: {e}"
            )
            return False

        utils.logger.info(
            f"[CookieManager.save_cookies] Successfully saved {cookie_count} cookies to {self.cookie_file}"
        )
        return True

    def load_cookies(self) -> Optional[List[Dict]]:
        """Read the saved cookie list from the cookie file.

        Returns:
            The non-empty Playwright-format cookie list, or None when the
            file is missing, unreadable, malformed, or holds no cookies.
        """
        try:
            cookie_data = self._read_cookie_data()
        except FileNotFoundError:
            utils.logger.info(
                f"[CookieManager.load_cookies] Cookie file not found: {self.cookie_file}"
            )
            return None
        except (OSError, ValueError) as e:
            utils.logger.error(
                f"[CookieManager.load_cookies] Failed to load cookies: {e}"
            )
            return None

        cookies = cookie_data.get("cookies")
        if not isinstance(cookies, list) or not cookies:
            utils.logger.warning(
                f"[CookieManager.load_cookies] Cookie file contains no usable cookies: {self.cookie_file}"
            )
            return None

        saved_at = cookie_data.get("saved_at")
        saved_time = cookie_data.get("saved_time", "Unknown")

        if not isinstance(saved_at, (int, float)):
            # A missing timestamp means "age unknown", not "saved in 1970".
            utils.logger.warning(
                "[CookieManager.load_cookies] Cookie file has no valid save timestamp, cookie age is unknown"
            )
        elif time.time() - saved_at > self.MAX_COOKIE_AGE_SECONDS:
            utils.logger.warning(
                f"[CookieManager.load_cookies] Cookies are older than 30 days (saved at {saved_time}), may be expired"
            )

        utils.logger.info(
            f"[CookieManager.load_cookies] Successfully loaded {len(cookies)} cookies from {self.cookie_file} (saved at {saved_time})"
        )
        return cookies

    def clear_cookies(self) -> bool:
        """Delete the cookie file.

        Returns:
            True if the file is gone afterwards (including when it never
            existed), False if deleting it failed.
        """
        try:
            self.cookie_file.unlink()
        except FileNotFoundError:
            # Nothing to clear counts as success.
            return True
        except OSError as e:
            utils.logger.error(
                f"[CookieManager.clear_cookies] Failed to clear cookies: {e}"
            )
            return False

        utils.logger.info(
            f"[CookieManager.clear_cookies] Successfully cleared cookies file: {self.cookie_file}"
        )
        return True

    def get_cookie_info(self) -> Optional[Dict]:
        """Return metadata about the saved cookies without the cookie values.

        Returns:
            A dict with ``saved_at``, ``saved_time``, ``cookie_count`` and
            ``file_path``, or None when the file is missing or unreadable.
            ``cookie_count`` is 0 when the stored ``"cookies"`` entry is not
            a list.
        """
        try:
            cookie_data = self._read_cookie_data()
        except FileNotFoundError:
            return None
        except (OSError, ValueError) as e:
            utils.logger.error(
                f"[CookieManager.get_cookie_info] Failed to get cookie info: {e}"
            )
            return None

        cookies = cookie_data.get("cookies")
        return {
            "saved_at": cookie_data.get("saved_at", 0),
            "saved_time": cookie_data.get("saved_time", "Unknown"),
            "cookie_count": len(cookies) if isinstance(cookies, list) else 0,
            "file_path": str(self.cookie_file),
        }
self.xhs_client = await self.create_xhs_client(httpx_proxy_format) - if not await self.xhs_client.pong(): + + # Try to load saved cookies first if available and AUTO_SAVE_AND_USE_COOKIES is enabled + login_successful = False + cookie_manager = CookieManager() + + # If LOGIN_TYPE is cookie or if AUTO_SAVE_AND_USE_COOKIES is enabled with saved cookies, try cookie login first + if config.LOGIN_TYPE == "cookie" or (config.AUTO_SAVE_AND_USE_COOKIES and cookie_manager.load_cookies()): + utils.logger.info("[XiaoHongShuCrawler] Attempting to use saved cookies for authentication ...") + login_obj = XiaoHongShuLogin( + login_type="cookie", + login_phone="", + browser_context=self.browser_context, + context_page=self.context_page, + cookie_str=config.COOKIES, + ) + await login_obj.begin() + await self.xhs_client.update_cookies(browser_context=self.browser_context) + + # Validate if cookie login was successful + if await self.xhs_client.pong(): + utils.logger.info("[XiaoHongShuCrawler] Cookie authentication successful! 
async def login_by_cookies(self):
    """Log in to xiaohongshu by installing cookies into the browser context.

    When ``self.cookie_str`` is non-empty, every ``name=value`` pair parsed
    from it is added for the ``.xiaohongshu.com`` domain. When it is empty,
    the cookies come from the saved cookie file instead; if that yields
    nothing, a warning is logged and no cookies are added.
    """
    utils.logger.info("[XiaoHongShuLogin.login_by_cookies] Begin login xiaohongshu by cookie ...")

    # An explicit cookie string wins over the saved cookie file.
    if self.cookie_str:
        parsed = utils.convert_str_cookie_to_dict(self.cookie_str)
        batch = [
            {
                'name': name,
                'value': val,
                'domain': ".xiaohongshu.com",
                'path': "/"
            }
            for name, val in parsed.items()
        ]
        if batch:
            await self.browser_context.add_cookies(batch)
            utils.logger.info(f"[XiaoHongShuLogin.login_by_cookies] Successfully added {len(batch)} cookies from cookie string")
        return

    utils.logger.info("[XiaoHongShuLogin.login_by_cookies] Attempting to load cookies from file ...")
    stored = self.cookie_manager.load_cookies()
    if not stored:
        utils.logger.warning("[XiaoHongShuLogin.login_by_cookies] No saved cookies found, please login manually first")
        return

    await self.browser_context.add_cookies(stored)
    utils.logger.info(f"[XiaoHongShuLogin.login_by_cookies] Successfully loaded {len(stored)} cookies from file")


async def save_cookies_to_file(self):
    """Persist the browser context's current cookies through the cookie manager.

    Returns True when the cookies were written. Returns False when the
    context has no cookies, when the cookie manager reports a failed save,
    or when any exception is raised along the way (the exception is logged).
    """
    utils.logger.info("[XiaoHongShuLogin.save_cookies_to_file] Saving cookies to file ...")
    try:
        snapshot = await self.browser_context.cookies()
        # Short-circuit keeps save_cookies from being called with nothing to save.
        if not snapshot or not self.cookie_manager.save_cookies(snapshot):
            return False
        utils.logger.info(
            f"[XiaoHongShuLogin.save_cookies_to_file] Successfully saved {len(snapshot)} cookies"
        )
        return True
    except Exception as exc:
        utils.logger.error(f"[XiaoHongShuLogin.save_cookies_to_file] Failed to save cookies: {exc}")
        return False