import asyncio
import re
from urllib.parse import parse_qs, urlparse

import aiohttp
import phonenumbers
from bs4 import BeautifulSoup

# General settings
COMMON_YT_NUMBERS = {"+71065286052", "+71124342058", "+71741801509", "+74278607828",
                     "+74279440147", "+74282296063", "+74294967295", "+76861867695",
                     "+77741124305", "+78800000000", "+71741801753"}
EXCLUDED_NUMBERS = {"+74278607828", "+74282296063", "+74895550089", "+79600380000",
                    "+79600360000", "+76765370000", "+76765380000", "+79099990870",
                    "+78000000000", "+79600390000", "+76765390000"}

# Cache for Telegram link checks
telegram_cache = {}

# Request throttling
MAX_CONCURRENT_REQUESTS = 150
REQUEST_DELAY = 0.5  # delay before each request, in seconds

# HTTP headers
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html',
}

semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
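# Throttling model: the semaphore caps in-flight requests at
# MAX_CONCURRENT_REQUESTS, and each acquired slot sleeps REQUEST_DELAY before
# sending, so the delay applies per slot rather than as a global rate limit.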


async def fetch_page(session, url, retries=5):
    """Fetches a page, retrying on failure."""
    for attempt in range(retries):
        try:
            async with semaphore:
                await asyncio.sleep(REQUEST_DELAY)
                async with session.get(url, headers=HEADERS,
                                       timeout=aiohttp.ClientTimeout(total=10)) as response:
                    response.raise_for_status()  # treat HTTP error statuses as retryable failures
                    return await response.text()
        except Exception as e:
            print(f"[Error] Attempt {attempt + 1} failed for {url}: {e}")
            await asyncio.sleep(2)  # back off before retrying
    return None  # all attempts failed


async def check_telegram_link(session, telegram_url):
    """Checks whether a Telegram link is active."""
    if telegram_url in telegram_cache:
        return telegram_cache[telegram_url]
    try:
        async with semaphore:
            await asyncio.sleep(REQUEST_DELAY)
            async with session.get(telegram_url, headers=HEADERS,
                                   timeout=aiohttp.ClientTimeout(total=5)) as response:
                page_source = (await response.text()).upper()
                # Heuristic: look for call-to-action phrases on the preview page.
                # "TELEGRAM" appears on nearly every t.me page, so this check is permissive.
                result = any(phrase in page_source for phrase in [
                    "SEND MESSAGE", "WRITE A MESSAGE", "START CHATTING", "OPEN CHAT", "TELEGRAM"
                ])
                telegram_cache[telegram_url] = result
                return result
    except Exception as e:
        print(f"[Error] Telegram check failed for {telegram_url}: {e}")
        telegram_cache[telegram_url] = False
        return False
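# Note: failed checks are cached as False, so a transient network error marks a
# link inactive for the rest of the run; skip the cache write in the except
# branch if retrying on a later encounter is preferred.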


def find_telegram_links(text):
    """Finds Telegram links in free text."""
    pattern = r'(?:https?://)?(?:t\.me|telegram\.(?:me|dog))/[a-zA-Z0-9_-]+'
    matches = re.findall(pattern, text, re.IGNORECASE)
    # Normalize scheme-less matches and deduplicate.
    return list({match if match.startswith('http') else f'https://{match}'
                 for match in matches})
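# Example (hypothetical input):
#   find_telegram_links("see t.me/some_channel and https://telegram.me/another_one")
# should return both links normalized to https:// form; order is unspecified
# because duplicates are dropped via a set.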


def find_phone_numbers(text):
    """Finds and validates Russian phone numbers."""
    phone_pattern = r'(?:\+7|8)[\s\-()]*(?:495|800|9\d{2})[\s\-()]*\d{3}[\s\-()]*\d{2}[\s\-()]*\d{2}'
    potential_numbers = re.findall(phone_pattern, text)
    valid_numbers = set()
    for number in potential_numbers:
        cleaned = re.sub(r'[^\d+]', '', number)
        if cleaned.startswith('8'):
            cleaned = "+7" + cleaned[1:]  # normalize the domestic prefix to E.164
        if cleaned not in EXCLUDED_NUMBERS:
            try:
                parsed_number = phonenumbers.parse(cleaned, "RU")
                if phonenumbers.is_valid_number(parsed_number):
                    valid_numbers.add(cleaned)
            except phonenumbers.NumberParseException:
                continue
    return list(valid_numbers)
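# Example (hypothetical input):
#   find_phone_numbers("call 8 (495) 123-45-67")
# should normalize the match to "+74951234567", provided phonenumbers accepts
# it as a valid RU number; numbers in EXCLUDED_NUMBERS are silently dropped.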


async def get_data(youtube_url, session):
    """Extracts Telegram links and phone numbers from a YouTube channel's About page."""
    about_url = youtube_url if youtube_url.endswith('/about') else youtube_url.rstrip('/') + '/about'
    page_source = await fetch_page(session, about_url)
    if not page_source:
        return {'telegram': [], 'phones': []}
    soup = BeautifulSoup(page_source, 'lxml')
    telegram_links = set()
    for link in soup.find_all('a', href=True):
        href = link['href']
        # YouTube usually wraps external links in /redirect?q=<encoded target URL>;
        # unwrap those so the t.me check sees the real destination.
        if '/redirect' in href and 'q=' in href:
            href = parse_qs(urlparse(href).query).get('q', [href])[0]
        if 't.me/' in href:
            telegram_links.add(href)
    telegram_links.update(find_telegram_links(page_source))
    # Keep only links that still point at an active Telegram entity.
    active_links = set()
    for link in telegram_links:
        if await check_telegram_link(session, link):
            active_links.add(link)
    numbers = find_phone_numbers(page_source)
    return {'telegram': list(active_links), 'phones': numbers}


async def process_channel(youtube_url, session, results_list):
    """Processes a single YouTube channel and records any findings."""
    print(f"Processing: {youtube_url}")
    data = await get_data(youtube_url, session)
    if data['telegram'] or data['phones']:
        result = youtube_url
        if data['telegram']:
            result += f" | TG: {', '.join(data['telegram'])}"
        if data['phones']:
            result += f" | Phone: {', '.join(data['phones'])}"
        results_list.append(result)


async def main():
    """Reads the channel list, processes all channels concurrently, writes results."""
    results_list = []
    async with aiohttp.ClientSession() as session:
        with open('CHANNELS_LIST.txt', 'r', encoding='utf-8') as f:
            channels = [url.strip() for url in f if url.strip()]
        tasks = [process_channel(channel, session, results_list) for channel in channels]
        await asyncio.gather(*tasks)
    with open('COMBINED_RESULTS.txt', 'w', encoding='utf-8') as f:
        f.write("=== Parsing Results ===\n")
        for line in results_list:
            f.write(line + '\n')
        f.write("\n=== Parsing Completed ===")


if __name__ == "__main__":
    asyncio.run(main())
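# Expected input: CHANNELS_LIST.txt with one YouTube channel URL per line
# (blank lines are skipped). Channels that yield at least one active Telegram
# link or phone number are written to COMBINED_RESULTS.txt as
# "<channel URL> | TG: ... | Phone: ...".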