Untitled

2025-03-20 23:03 | Public
import aiohttp
import asyncio
import re
from bs4 import BeautifulSoup
from urllib.parse import parse_qs, urlparse
import phonenumbers

# General settings
# Numbers that commonly appear on YouTube pages; treated as false positives below.
COMMON_YT_NUMBERS = {"+71065286052", "+71124342058", "+71741801509", "+74278607828",
                     "+74279440147", "+74282296063", "+74294967295", "+76861867695",
                     "+77741124305", "+78800000000", "+71741801753"}

EXCLUDED_NUMBERS = {"+74278607828", "+74282296063", "+74895550089", "+79600380000",
                    "+79600360000", "+76765370000", "+76765380000", "+79099990870",
                    "+78000000000", "+79600390000", "+76765390000"}

# Cache of Telegram link check results
telegram_cache = {}

# Request throttling
MAX_CONCURRENT_REQUESTS = 150
REQUEST_DELAY = 0.5  # Delay between requests (seconds)

# HTTP заголовки
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html'
}

semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
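
# Note: each request sleeps REQUEST_DELAY while holding a semaphore slot, so the
# hard ceiling is MAX_CONCURRENT_REQUESTS / REQUEST_DELAY = 300 requests per second;
# real throughput is lower because the slot is also held for the request itself.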

async def fetch_page(session, url, retries=5):
    """Загружает страницу с повторными попытками."""
    for attempt in range(retries):
        try:
            async with semaphore:
                await asyncio.sleep(REQUEST_DELAY)
                async with session.get(url, headers=HEADERS, timeout=aiohttp.ClientTimeout(total=10)) as response:
                    response.raise_for_status()  # Treat HTTP 4xx/5xx as failures so they are retried too.
                    return await response.text()
        except Exception as e:
            print(f"[Error] Attempt {attempt + 1} failed for {url}: {e}")
            await asyncio.sleep(2)  # Wait before retrying
    return None  # All attempts failed

async def check_telegram_link(session, telegram_url):
    """Проверяет, активна ли Telegram-ссылка."""
    if telegram_url in telegram_cache:
        return telegram_cache[telegram_url]

    try:
        async with semaphore:
            await asyncio.sleep(REQUEST_DELAY)
            async with session.get(telegram_url, headers=HEADERS, timeout=aiohttp.ClientTimeout(total=5)) as response:
                page_source = (await response.text()).upper()
                # Look for action-button text that only appears on live t.me preview
                # pages. "TELEGRAM" alone is not checked: every t.me page contains
                # it, even for dead links, which would make the check always pass.
                result = any(phrase in page_source for phrase in [
                    "SEND MESSAGE", "WRITE A MESSAGE", "START CHATTING", "OPEN CHAT"
                ])
                telegram_cache[telegram_url] = result
                return result
    except Exception as e:
        print(f"[Ошибка] Проверка Telegram {telegram_url} не удалась: {e}")
        telegram_cache[telegram_url] = False
        return False

def find_telegram_links(text):
    """Находит Telegram ссылки в тексте."""
    pattern = r'(?:https?://)?(?:t\.me|telegram\.(?:me|dog))/[a-zA-Z0-9_-]+'
    matches = re.findall(pattern, text, re.IGNORECASE)
    return list(set(f'https://{match}' if not match.startswith('http') else match for match in matches))
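
# A quick check of the normalization above, with a hypothetical username:
#   find_telegram_links("contact t.me/example_channel") -> ["https://t.me/example_channel"]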

def find_phone_numbers(text):
    """Находит и валидирует номера телефонов."""
    phone_pattern = r'(?:\+7|8)[\s\-()]*(?:495|800|9\d{2})[\s\-()]*\d{3}[\s\-()]*\d{2}[\s\-()]*\d{2}'
    potential_numbers = re.findall(phone_pattern, text)
    valid_numbers = set()

    for number in potential_numbers:
        cleaned = re.sub(r'[^\d+]', '', number)
        if cleaned.startswith('8'):
            cleaned = "+7" + cleaned[1:]
        # Skip known junk: explicit exclusions plus numbers common on YouTube pages.
        if cleaned not in EXCLUDED_NUMBERS and cleaned not in COMMON_YT_NUMBERS:
            try:
                parsed_number = phonenumbers.parse(cleaned, "RU")
                if phonenumbers.is_valid_number(parsed_number):
                    valid_numbers.add(cleaned)
            except phonenumbers.NumberParseException:
                continue

    return list(valid_numbers)
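
# Worked example with hypothetical input: "call 8 (926) 123-45-67" matches the
# pattern, is normalized to "+79261234567", and passes phonenumbers validation;
# "8 960 038-00-00" also matches but is dropped via EXCLUDED_NUMBERS.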

async def get_data(youtube_url, session):
    """Извлекает Telegram-ссылки и номера телефонов с YouTube-канала."""
    about_url = youtube_url + '/about' if not youtube_url.endswith('/about') else youtube_url
    page_source = await fetch_page(session, about_url)

    if not page_source:
        return {'telegram': [], 'phones': []}

    soup = BeautifulSoup(page_source, 'lxml')
    telegram_links = set()

    for link in soup.find_all('a', href=True):
        href = link['href']
        # YouTube wraps external links in /redirect?q=<url-encoded target>; unwrap them first.
        if '/redirect' in href:
            href = parse_qs(urlparse(href).query).get('q', [''])[0]
        if 't.me/' in href:
            telegram_links.add(href)

    text_links = find_telegram_links(page_source)
    telegram_links.update(text_links)

    active_links = set()
    for link in telegram_links:
        if await check_telegram_link(session, link):
            active_links.add(link)

    numbers = find_phone_numbers(page_source)

    return {'telegram': list(active_links), 'phones': numbers}
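
# For a channel whose About page links one live Telegram chat and lists one
# number, the return value would look like (hypothetical values):
#   {'telegram': ['https://t.me/example_channel'], 'phones': ['+79261234567']}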

async def process_channel(youtube_url, session, results_list):
    """Обрабатывает один YouTube-канал."""
    print(f"Обрабатываю: {youtube_url}")
    data = await get_data(youtube_url, session)

    if data['telegram'] or data['phones']:
        result = f"{youtube_url}"
        if data['telegram']:
            result += f" | TG: {', '.join(data['telegram'])}"
        if data['phones']:
            result += f" | Phone: {', '.join(data['phones'])}"
        results_list.append(result)

async def main():
    """Основная функция обработки."""
    results_list = []
    async with aiohttp.ClientSession() as session:
        with open('CHANNELS_LIST.txt', 'r', encoding='utf-8') as f:
            channels = [url.strip() for url in f if url.strip()]
        
        tasks = [process_channel(channel, session, results_list) for channel in channels]
        await asyncio.gather(*tasks)

    with open('COMBINED_RESULTS.txt', 'w', encoding='utf-8') as f:
        f.write("=== Parsing Results ===\n")
        for line in results_list:
            f.write(line + '\n')
        f.write("\n=== Parsing Completed ===")

if __name__ == "__main__":
    asyncio.run(main())
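
# CHANNELS_LIST.txt is expected to contain one channel URL per line, e.g. (hypothetical):
#   https://www.youtube.com/@example_channel
# Each matching channel produces one line in COMBINED_RESULTS.txt of the form:
#   <channel url> | TG: <links> | Phone: <numbers>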