from __future__ import annotations

import dataclasses
import html
import math
import re
import xml.etree.ElementTree as ET
import time
from dataclasses import dataclass
from typing import Iterable
from urllib.parse import parse_qs, unquote, urlparse

import requests

# URL of DuckDuckGo's HTML search page.
# NOTE(review): nothing in the visible code requests this URL; presumably it
# pairs with parse_duckduckgo_results — confirm it is still needed.
DUCKDUCKGO_HTML = "https://html.duckduckgo.com/html/"
# `site:` search-operator filters, one per listing domain.
# NOTE(review): not referenced by the visible code (build_queries never reads
# its `sources` argument) — confirm whether these should be applied.
DEFAULT_SOURCES = [
    "site:pap.fr",
    "site:seloger.com",
    "site:leboncoin.fr",
    "site:bienici.com",
    "site:green-acres.fr",
]

# Rough (latitude, longitude) pairs, in decimal degrees, for Montpellier and
# its ring towns — enough to make the queries useful without requiring a
# geocoding service for every run. The "Montpellier" entry doubles as the
# reference point: nearby_towns_within and _guess_town measure every distance
# from it.
NEARBY_TOWNS: dict[str, tuple[float, float]] = {
    "Montpellier": (43.6108, 3.8767),
    "Castelnau-le-Lez": (43.6351, 3.9006),
    "Lattes": (43.5681, 3.90),
    "Pérols": (43.5585, 3.95),
    "Mauguio": (43.6167, 4.0),
    "Saint-Jean-de-Védas": (43.5721, 3.8267),
    "Juvignac": (43.6125, 3.8025),
    "Clapiers": (43.66, 3.89),
    "Grabels": (43.61, 3.80),
    "Villeneuve-lès-Maguelone": (43.522, 3.857),
    "Le Crès": (43.64, 3.93),
    "Fabrègues": (43.55, 3.77),
    "Saint-Gély-du-Fesc": (43.69, 3.80),
    "Palavas-les-Flots": (43.53, 3.93),
    "Castries": (43.68, 3.98),
    "Baillargues": (43.66, 4.0),
    "Mireval": (43.52, 3.78),
}

# Maps each supported property type (lower-case) to the keywords that
# score_result counts as a match in a result's title/snippet. build_queries
# falls back to "maison" for any type that is not a key here.
PROPERTY_HINTS = {
    "maison": ["maison", "villa", "demeure"],
    "appartement": ["appartement"],
    "terrain": ["terrain"],
}


@dataclass(frozen=True)
class NearbyTown:
    """Immutable town record: a name plus its coordinates.

    Instances are built from NEARBY_TOWNS entries, so ``lat``/``lon`` are
    decimal degrees (they are fed to ``_haversine_km``, which converts to
    radians).
    """

    name: str
    lat: float
    lon: float


@dataclass
class SearchResult:
    """One search hit, enriched with the price/town/score derived from it."""

    title: str
    url: str
    snippet: str
    # Host part of ``url`` as filled in by search_properties.
    source: str
    # ``time.time()`` evaluated when the instance is created.
    fetched_at: float = dataclasses.field(default_factory=time.time)
    # Price found by extract_price, or None when no price was recognised.
    price: int | None = None
    # Town name found by _guess_town, or None when no known town is mentioned.
    town: str | None = None
    # Distance in km between that town and the "Montpellier" reference point.
    distance_km: float | None = None
    # Relevance score from score_result; higher sorts first.
    score: float = 0.0


def _haversine_km(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    r = 6371.0
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon / 2) ** 2
    )
    return 2 * r * math.asin(math.sqrt(a))


def _normalise_url(url: str) -> str:
    url = html.unescape(url).strip()
    if url.startswith("//duckduckgo.com/l/?"):
        parsed = urlparse("https:" + url)
    else:
        parsed = urlparse(url)
    if parsed.netloc.endswith("duckduckgo.com") and parsed.path.startswith("/l/"):
        qs = parse_qs(parsed.query)
        uddg = qs.get("uddg", [None])[0]
        if uddg:
            return unquote(uddg)
    return url


def nearby_towns_within(radius_km: float) -> list[NearbyTown]:
    """List the known towns lying within ``radius_km`` of Montpellier.

    Distances are measured from the "Montpellier" entry of NEARBY_TOWNS. A
    0.25 km tolerance is added to the radius, and the result is ordered from
    nearest to farthest.
    """
    origin_lat, origin_lon = NEARBY_TOWNS["Montpellier"]
    cutoff = radius_km + 0.25

    def distance_from_origin(town: NearbyTown) -> float:
        return _haversine_km(origin_lat, origin_lon, town.lat, town.lon)

    candidates = (
        NearbyTown(name=town_name, lat=coords[0], lon=coords[1])
        for town_name, coords in NEARBY_TOWNS.items()
    )
    selected = [town for town in candidates if distance_from_origin(town) <= cutoff]
    selected.sort(key=distance_from_origin)
    return selected


def build_queries(
    center: str,
    radius_km: float,
    max_price: int,
    property_type: str,
    sources: Iterable[str] | None = None,
) -> list[str]:
    """Build the search-engine queries for one property search.

    One ``"<type> à vendre <town>"`` query is produced per town returned by
    ``nearby_towns_within(radius_km)``, followed by a single catch-all query
    that ORs all of those towns together. If ``center`` is a known town that
    is not already in that list, it is placed first. Town names containing a
    space are wrapped in double quotes.

    ``property_type`` is stripped and lower-cased; anything that is not a key
    of PROPERTY_HINTS (including an empty string) falls back to "maison".

    NOTE(review): ``max_price`` and ``sources`` are accepted but not read by
    this function — confirm whether they are meant to shape the queries.
    """
    towns = nearby_towns_within(radius_km)
    center_is_known = center in NEARBY_TOWNS
    center_already_listed = any(town.name == center for town in towns)
    if center_is_known and not center_already_listed:
        towns = [NearbyTown(center, *NEARBY_TOWNS[center]), *towns]

    kind = property_type.strip().lower()
    if kind not in PROPERTY_HINTS:
        kind = "maison"

    # Multi-word names are quoted so the engine treats them as one phrase.
    labels = [f'"{town.name}"' if " " in town.name else town.name for town in towns]

    queries = [" ".join([kind, "à vendre", label]) for label in labels]

    # Broader catch-all query covering every selected commune at once.
    commune_blob = " OR ".join(labels)
    queries.append(f"{kind} à vendre ({commune_blob})")
    return queries


def parse_duckduckgo_results(html_text: str) -> list[dict[str, str]]:
    """Extract ``{"title", "url", "snippet"}`` dicts from a DuckDuckGo HTML page.

    Result links (``class="result__a"``) and snippets
    (``class="result__snippet"``) are collected separately with regexes and
    then paired by position; a link without a snippet at the same index gets
    an empty snippet. URLs are passed through ``_normalise_url``.

    NOTE(review): because pairing is positional, a result that has no snippet
    shifts the snippets of every later result — acceptable for this loose
    parser, but worth knowing.
    """

    def plain_text(fragment: str) -> str:
        # Strip tags BEFORE decoding entities. Doing it the other way round
        # turns escaped literal text such as "&lt;100 m² &gt;" into something
        # that looks like a tag and silently deletes it.
        return html.unescape(re.sub(r"<.*?>", "", fragment)).strip()

    links = [
        {"url": _normalise_url(href), "title": plain_text(title)}
        for href, title in re.findall(
            r'<a[^>]*class="result__a"[^>]*href="(.*?)"[^>]*>(.*?)</a>', html_text, flags=re.S
        )
    ]

    snippets = [
        plain_text(snippet)
        for snippet in re.findall(r'<a[^>]*class="result__snippet"[^>]*>(.*?)</a>', html_text, flags=re.S)
    ]

    results: list[dict[str, str]] = []
    for idx, link in enumerate(links):
        snippet = snippets[idx] if idx < len(snippets) else ""
        results.append({"title": link["title"], "url": link["url"], "snippet": snippet})
    return results


PRICE_WITH_CURRENCY_RE = re.compile(
    r"(?<!\d)(\d{1,3}(?:[\s.,]\d{3})+|\d{4,6})\s*(?:€|eur|euros)|(?<!\d)(\d{2,3})\s*(k|K)"
)


def extract_price(text: str) -> int | None:
    lowered = text.lower()
    if "prix sur demande" in lowered or "sur demande" in lowered:
        return None
    match = PRICE_WITH_CURRENCY_RE.search(text)
    if not match:
        return None
    millionish, short_k, suffix = match.groups()
    if short_k:
        return int(short_k) * 1000
    if not millionish:
        return None
    cleaned = re.sub(r"[\s.,]", "", millionish)
    if not cleaned.isdigit():
        return None
    return int(cleaned)


def _guess_town(text: str) -> tuple[str | None, float | None]:
    """Find the known town mentioned in ``text`` that is closest to Montpellier.

    Matching is a case-insensitive substring test against every NEARBY_TOWNS
    name. Returns ``(name, distance_km)`` with the distance from the
    "Montpellier" entry rounded to one decimal, or ``(None, None)`` when no
    known town appears in the text.
    """
    lowered = text.lower()
    center_lat, center_lon = NEARBY_TOWNS["Montpellier"]
    best_name: str | None = None
    best_dist: float | None = None
    for name, (lat, lon) in NEARBY_TOWNS.items():
        if name.lower() not in lowered:
            continue
        dist = _haversine_km(center_lat, center_lon, lat, lon)
        # Compare unrounded distances; rounding happens only on return.
        # Comparing against an already-rounded best could let a farther town
        # replace a nearer one when both fall in the same 0.1 km bucket.
        if best_dist is None or dist < best_dist:
            best_name, best_dist = name, dist
    if best_dist is None:
        return (None, None)
    return (best_name, round(best_dist, 1))


def score_result(result: dict[str, str], center: str, radius_km: float, max_price: int, property_type: str) -> float:
    """Score one raw result dict for relevance; higher is better.

    The lower-cased title and snippet are inspected for:

    * the property type itself (+15) and each of its PROPERTY_HINTS keywords
      (+8 per keyword present);
    * a price (+25 when within ``max_price``; otherwise a penalty of 5 points
      per 100 000 over budget, capped at 20);
    * a known town (+12, plus up to 18 points that shrink with its distance,
      then +10 inside ``radius_km`` or -1.5 per km beyond it);
    * any of the amenity words jardin/terrasse/garage/piscine (+4 once).

    The total is rounded to two decimals.

    NOTE(review): ``center`` is not read here; distances come from
    ``_guess_town``, which measures from the "Montpellier" entry.
    """
    title = result.get("title", "")
    snippet = result.get("snippet", "")
    text = f"{title} {snippet}".lower()
    wanted = property_type.lower()
    total = 0.0

    # Property-type match, then one bonus per matching keyword.
    if wanted in text:
        total += 15
    keywords = PROPERTY_HINTS.get(wanted, [wanted])
    total += 8 * sum(1 for keyword in keywords if keyword in text)

    # Budget fit.
    found_price = extract_price(text)
    if found_price is not None:
        if found_price > max_price:
            total -= min(20, (found_price - max_price) / 100000 * 5)
        else:
            total += 25

    # Location fit.
    town_name, km = _guess_town(text)
    if town_name:
        total += 12
        if km is not None:
            total += max(0, 18 - km)
            if km > radius_km:
                total -= (km - radius_km) * 1.5
            else:
                total += 10

    # Flat bonus when at least one amenity is mentioned.
    amenities = ("jardin", "terrasse", "garage", "piscine")
    if any(word in text for word in amenities):
        total += 4

    return round(total, 2)


def fetch_bing_rss(query: str, timeout: int = 20) -> str:
    """Fetch Bing search results for ``query`` in RSS form and return the body text.

    Sends a GET to the Bing search endpoint with ``format=rss`` and a
    browser-like User-Agent. ``timeout`` is handed to ``requests.get``.
    An HTTP error status raises via ``raise_for_status()``.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
    }
    search_params = {"format": "rss", "q": query}
    reply = requests.get(
        "https://www.bing.com/search",
        params=search_params,
        headers=browser_headers,
        timeout=timeout,
    )
    reply.raise_for_status()
    return reply.text


def parse_bing_rss_results(xml_text: str) -> list[dict[str, str]]:
    """Turn an RSS document into ``{"title", "url", "snippet"}`` dicts.

    One dict is produced per ``channel/item`` element, taken from its
    ``title``, ``link`` and ``description`` children. A missing or empty
    child becomes an empty string, and every value is stripped. Malformed XML
    propagates the parser's exception.
    """

    def child_text(node: ET.Element, tag: str) -> str:
        return (node.findtext(tag) or "").strip()

    feed = ET.fromstring(xml_text)
    return [
        {
            "title": child_text(entry, "title"),
            "url": child_text(entry, "link"),
            "snippet": child_text(entry, "description"),
        }
        for entry in feed.findall("./channel/item")
    ]


def search_properties(
    center: str = "Montpellier",
    radius_km: float = 20,
    max_price: int = 400000,
    property_type: str = "maison",
    sources: Iterable[str] | None = None,
    limit: int = 20,
) -> list[SearchResult]:
    """Run every generated query against Bing RSS and return the best hits.

    For each query from ``build_queries`` the RSS feed is fetched and parsed.
    Items without a URL, or whose URL was already seen, are skipped. Every
    remaining item becomes a ``SearchResult`` enriched with a price, a town,
    its distance and a relevance score. Results are ordered by descending
    score (ties: lower price first, with unpriced items ahead of priced ones)
    and truncated to ``limit``.

    Raises whatever ``fetch_bing_rss`` or ``parse_bing_rss_results`` raise
    (HTTP or network errors, malformed XML); one failing query aborts the
    whole search.
    """
    seen: set[str] = set()
    collected: list[SearchResult] = []
    queries = build_queries(
        center=center,
        radius_km=radius_km,
        max_price=max_price,
        property_type=property_type,
        sources=sources,
    )
    for query in queries:
        xml_text = fetch_bing_rss(query)
        for item in parse_bing_rss_results(xml_text):
            url = item["url"]
            if not url or url in seen:
                continue
            seen.add(url)

            blob = f"{item['title']} {item['snippet']}"
            # score_result works on lower-cased text. Extract the stored price
            # from the same form so the recorded price and the scored price
            # can never disagree (e.g. on "EUR" vs "eur").
            price = extract_price(blob.lower())
            town, distance = _guess_town(blob)

            # Drop only a leading "www."; str.replace would also remove it
            # from the middle of a host name.
            host = urlparse(url).netloc
            source = host[4:] if host.startswith("www.") else host

            collected.append(
                SearchResult(
                    title=item["title"],
                    url=url,
                    snippet=item["snippet"],
                    source=source,
                    price=price,
                    town=town,
                    distance_km=distance,
                    score=score_result(
                        item,
                        center=center,
                        radius_km=radius_km,
                        max_price=max_price,
                        property_type=property_type,
                    ),
                )
            )

    collected.sort(key=lambda r: (r.score, -(r.price or 0)), reverse=True)
    return collected[:limit]