Core API¶

`earthcatalog.catalog.EarthCatalog` ¶

Simplified facade for querying an EarthCatalog.

Combines PyIceberg catalog, table, and CatalogInfo into a single interface for spatial/temporal queries with automatic file pruning.

Example::

from earthcatalog import open as ec_open
from obstore.store import S3Store
from shapely.geometry import Point

store = S3Store(bucket='my-bucket', region='us-west-2')
ec = ec_open(store=store, base='s3://my-bucket/catalog')

point = Point(-133.99, 58.74)
paths = ec.search_files(point, start_datetime='2020-01-01')

Source code in earthcatalog/catalog.py

class EarthCatalog:
    """Simplified facade for querying an EarthCatalog.

    Combines PyIceberg catalog, table, and CatalogInfo into a single interface
    for spatial/temporal queries with automatic file pruning.

    Example::

        from earthcatalog import open as ec_open
        from obstore.store import S3Store
        from shapely.geometry import Point

        store = S3Store(bucket='my-bucket', region='us-west-2')
        ec = ec_open(store=store, base='s3://my-bucket/catalog')

        point = Point(-133.99, 58.74)
        paths = ec.search_files(point, start_datetime='2020-01-01')
    """

    def __init__(
        self,
        catalog: object,
        table: Table,
        info: CatalogInfo,
        store: object | None = None,
        *,
        catalog_key: str | None = None,
    ):
        """Initialize an EarthCatalog facade.

        Args:
            catalog: PyIceberg SqlCatalog instance
            table: PyIceberg Table instance
            info: CatalogInfo with grid metadata
            store: obstore Store instance (for reading hash index from S3)
            catalog_key: Key within *store* where catalog.db is persisted.
                         Required for ingest() which needs to upload changes.
        """
        self._catalog = catalog
        self._table = table
        self._info = info
        self._store = store
        self._catalog_key = catalog_key

    def search_files(
        self,
        geom,
        start_datetime: str | datetime | None = None,
        end_datetime: str | datetime | None = None,
    ) -> list[str]:
        """Return Parquet file paths for partitions intersecting *geom*."""
        return self._info.file_paths(
            self._table,
            geom,
            start_datetime=start_datetime,
            end_datetime=end_datetime,
        )

    def search(self, **kwargs):
        """Build a deferred ``EarthCatalogItemSearch`` over the catalog.

        The keyword arguments mirror :func:`rustac.search`:
        ``intersects``, ``bbox``, ``datetime``, ``filter`` (CQL2 JSON),
        ``ids``, ``collections``, ``max_items``, ``limit``, ``sortby``,
        ``include``, ``exclude``, ``query``, etc.  They are handed to the
        search object untouched as its ``params``.

        Temporal filtering belongs in the top-level ``datetime`` kwarg.  Do
        **not** reference ``datetime`` inside the CQL2 ``filter`` —
        rustac generates broken SQL when ``datetime`` appears in a CQL2
        expression.

        Performance
        -----------
        :meth:`duck_search` (DuckDB parallel I/O) is documented as ~2× faster
        across all query types.  ``search()`` and ``search_to_arrow()`` use
        rustac (sequential per-file) and have comparable speed.  See
        :doc:`/operations/search_performance` for detailed benchmarks.

        Returns
        -------
        EarthCatalogItemSearch
            A lazy, pystac_client-compatible search result.  No I/O until
            ``items()``, ``item_collection()``, or ``pages()`` is called.
        """
        from .search import EarthCatalogItemSearch, _FileSearchEngine

        # The engine prunes candidate files through this catalog's Iceberg
        # metadata; the context-manager factory is passed uncalled so the
        # search object can enter it when it actually performs I/O.
        return EarthCatalogItemSearch(
            params=kwargs,
            engine=_FileSearchEngine(prune_fn=self._search_prune),
            table=self._table,
            anonymous_ctx=self._cleared_env_s3,
        )

    def search_to_arrow(self, **kwargs):
        """Run a search eagerly and return the matches as a PyArrow table.

        The query runs inside the :meth:`_cleared_env_s3` context so that
        anonymous stores are read with unsigned requests.
        """
        from .search import _FileSearchEngine

        arrow_engine = _FileSearchEngine(prune_fn=self._search_prune)
        unsigned_env = self._cleared_env_s3()
        with unsigned_env:
            arrow_result = arrow_engine.search_to_arrow(**kwargs)
        return arrow_result

    def search_uris(self, **kwargs):
        """Return asset URIs as a ``pandas.DataFrame`` with ``(id, uri)`` columns.

        Accepts the same kwargs as :meth:`search` (``intersects``, ``bbox``,
        ``datetime``, ``filter``, ``max_items``, etc.).  Only ``intersects``,
        ``bbox``, ``datetime`` (via ``_extract_datetime_range``), ``filter``
        and ``max_items`` are read here.

        Candidate files are pruned through Iceberg metadata, then DuckDB reads
        **only** the ``id`` and ``assets`` columns of those files.  The ``uri``
        value is the ``href`` of the ``data`` asset; it is ``None`` when the
        asset JSON is missing, unparsable, or has no ``data`` entry.

        ``max_items`` is applied after the query (the result is sliced), not
        as a SQL ``LIMIT``.

        Examples::

            import cql2
            df = catalog.search_uris(
                intersects={"type": "Point", "coordinates": [-45, 70]},
                datetime="2020-01-01/2020-12-31",
                filter=cql2.parse_text("percent_valid_pixels >= 80").to_json(),
                max_items=100,
            )
            # df has columns: id, uri
            for _, row in df.iterrows():
                print(row.id, row.uri)
        """
        import json

        import duckdb
        import pandas as pd
        from shapely.geometry import box, shape

        from .search import _cql2_to_sql, _extract_datetime_range

        def _sql_str(value) -> str:
            # SQL string literal: single-quoted, embedded quotes doubled.
            # (repr() is not a SQL quoter: it switches to double quotes and
            # escapes backslashes, both of which DuckDB reads differently.)
            return "'" + str(value).replace("'", "''") + "'"

        def _data_href(raw_assets):
            # Pull assets["data"]["href"] out of a JSON-encoded assets value.
            if not raw_assets:
                return None
            try:
                return json.loads(raw_assets).get("data", {}).get("href")
            except (json.JSONDecodeError, AttributeError):
                return None

        # --- geometry ---
        geom = None
        if "intersects" in kwargs:
            geom = shape(kwargs["intersects"])
        elif "bbox" in kwargs:
            b = kwargs["bbox"]
            if len(b) == 6:
                # 3-D bbox is (minx, miny, minz, maxx, maxy, maxz): drop z.
                geom = box(b[0], b[1], b[3], b[4])
            else:
                geom = box(b[0], b[1], b[2], b[3])

        # --- Iceberg pruning ---
        start_dt, end_dt = _extract_datetime_range(**kwargs)
        paths = self._info.file_paths(
            self._table,
            geom,
            start_datetime=start_dt,
            end_datetime=end_dt,
        )
        if not paths:
            return pd.DataFrame({"id": [], "uri": []})

        # --- build SQL (read only id + assets) ---
        path_list = ", ".join(_sql_str(p) for p in paths)
        conditions: list[str] = []
        if geom is not None:
            conditions.append(f"ST_Intersects(geometry, ST_GeomFromText({_sql_str(geom.wkt)}))")
        if start_dt is not None:
            conditions.append(f"datetime >= {_sql_str(start_dt)}")
        if end_dt is not None:
            conditions.append(f"datetime <= {_sql_str(end_dt)}")
        raw_filter = kwargs.get("filter")
        if raw_filter is not None:
            conditions.append(_cql2_to_sql(raw_filter))
        where = " AND ".join(conditions) if conditions else "TRUE"
        max_items = kwargs.get("max_items")

        sql = f"""SELECT id, assets FROM read_parquet([{path_list}]) WHERE {where}"""

        # --- execute (Arrow → list is faster than pandas iterrows) ---
        con = duckdb.connect()
        try:
            con.execute("INSTALL spatial; LOAD spatial;")
            # Blank credentials on this connection.
            con.execute("SET s3_access_key_id='';")
            con.execute("SET s3_secret_access_key='';")
            con.execute("SET s3_session_token='';")
            table = con.execute(sql).to_arrow_table()
        finally:
            # Always release the connection, including when the query fails.
            con.close()
        if max_items is not None and table.num_rows > max_items:
            table = table.slice(0, max_items)

        # --- extract data URIs from JSON assets ---
        ids = table.column("id").to_pylist()
        uris = [_data_href(a) for a in table.column("assets").to_pylist()]

        return pd.DataFrame({"id": ids, "uri": uris})

    def duck_search(self, **kwargs):
        """Search using DuckDB, returning results as a ``pandas.DataFrame``.

        Accepts the same kwargs as :meth:`search` (``intersects``, ``bbox``,
        ``datetime``, ``filter``, ``max_items``, etc.).  Only ``intersects``,
        ``bbox``, ``datetime`` (via ``_extract_datetime_range``), ``filter``
        and ``max_items`` are read here.

        DuckDB reads Parquet files in parallel internally, making this
        **~2× faster** than :meth:`search` across all query types.
        Returns a DataFrame with flat columns — no pystac conversion
        overhead.  For pystac Items use :meth:`search` (lazy iteration).

        An empty ``DataFrame`` (no columns) is returned when Iceberg pruning
        leaves no candidate files.  ``max_items`` is applied with
        ``DataFrame.head`` after the query rather than as a SQL ``LIMIT``.

        Examples::

            df = catalog.duck_search(
                intersects={"type": "Point", "coordinates": [-45, 70]},
                datetime="1980-01-01/2015-12-31",
                max_items=100,
            )
            # df is a pandas.DataFrame
            print(df.columns.tolist())
        """
        import duckdb
        from shapely.geometry import box, shape

        from .search import _cql2_to_sql, _extract_datetime_range

        def _sql_str(value) -> str:
            # SQL string literal: single-quoted, embedded quotes doubled.
            # (repr() is not a SQL quoter: it switches to double quotes and
            # escapes backslashes, both of which DuckDB reads differently.)
            return "'" + str(value).replace("'", "''") + "'"

        geom = None
        if "intersects" in kwargs:
            geom = shape(kwargs["intersects"])
        elif "bbox" in kwargs:
            b = kwargs["bbox"]
            if len(b) == 6:
                # 3-D bbox is (minx, miny, minz, maxx, maxy, maxz): drop z.
                geom = box(b[0], b[1], b[3], b[4])
            else:
                geom = box(b[0], b[1], b[2], b[3])

        start_dt, end_dt = _extract_datetime_range(**kwargs)
        paths = self._info.file_paths(
            self._table, geom, start_datetime=start_dt, end_datetime=end_dt
        )
        if not paths:
            import pandas as pd

            return pd.DataFrame()

        path_list = ", ".join(_sql_str(p) for p in paths)
        conditions: list[str] = []
        if geom is not None:
            conditions.append(f"ST_Intersects(geometry, ST_GeomFromText({_sql_str(geom.wkt)}))")
        if start_dt is not None:
            conditions.append(f"datetime >= {_sql_str(start_dt)}")
        if end_dt is not None:
            conditions.append(f"datetime <= {_sql_str(end_dt)}")
        raw_filter = kwargs.get("filter")
        if raw_filter is not None:
            conditions.append(_cql2_to_sql(raw_filter))

        where = " AND ".join(conditions) if conditions else "TRUE"
        max_items = kwargs.get("max_items")
        # LIMIT omitted — triggers 7× slower plan for multi-file reads
        sql = f"SELECT * FROM read_parquet([{path_list}]) WHERE {where}"

        con = duckdb.connect()
        try:
            con.execute("INSTALL spatial; LOAD spatial;")
            # Blank credentials on this connection.
            con.execute("SET s3_access_key_id='';")
            con.execute("SET s3_secret_access_key='';")
            con.execute("SET s3_session_token='';")
            df = con.execute(sql).fetchdf()
        finally:
            # Always release the connection, including when the query fails.
            con.close()
        if max_items is not None and len(df) > max_items:
            df = df.head(max_items)
        return df

    def _search_prune(self, geom, start_datetime=None, end_datetime=None):
        """Prune warehouse files via Iceberg partition metadata (zero I/O)."""
        return self._info.file_paths(
            self._table, geom, start_datetime=start_datetime, end_datetime=end_datetime
        )

    def _cleared_env_s3(self):
        """Context manager: clear AWS cred env vars so rustac/DuckDB use unsigned requests.

        rustac and DuckDB read ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` from the
        environment rather than using the obstore store's auth.  When the store was created
        as anonymous (``skip_signature``) or the environment has no credentials, this
        context manager temporarily removes them and sets ``AWS_NO_SIGN_REQUEST=yes``.
        """
        import os
        from contextlib import contextmanager

        anonymous = not os.environ.get("AWS_ACCESS_KEY_ID")
        if not anonymous and self._store is not None and hasattr(self._store, "config"):
            anonymous = self._store.config.get("skip_signature") in (True, "true")

        @contextmanager
        def _ctx():
            if not anonymous:
                yield
                return
            saved = {
                "AWS_ACCESS_KEY_ID": os.environ.pop("AWS_ACCESS_KEY_ID", None),
                "AWS_SECRET_ACCESS_KEY": os.environ.pop("AWS_SECRET_ACCESS_KEY", None),
                "AWS_SESSION_TOKEN": os.environ.pop("AWS_SESSION_TOKEN", None),
            }
            os.environ["AWS_NO_SIGN_REQUEST"] = "yes"
            try:
                yield
            finally:
                os.environ.pop("AWS_NO_SIGN_REQUEST", None)
                for k, v in saved.items():
                    if v is not None:
                        os.environ[k] = v

        return _ctx()

    def stats(self) -> list[dict]:
        """Return per-partition row counts and file sizes from Iceberg metadata."""
        return self._info.stats(self._table)

    def unique_item_count(self) -> int:
        """Return the count of unique STAC items from the hash index."""
        default_hash_index_path = None
        if self._catalog is not None:
            warehouse = self._catalog.properties.get("warehouse", "")
            if warehouse:
                default_hash_index_path = warehouse.rstrip("/") + "_id_hashes.parquet"

        return self._info.unique_item_count(self._table, self._store, default_hash_index_path)

    def info(self) -> CatalogInfo:
        """Return the grid metadata and catalog statistics object."""
        return self._info

    def ingest(
        self,
        inventory_path: str,
        *,
        mode: str = "auto",
        chunk_size: int = 10000,
        limit: int | None = None,
        since: datetime | None = None,
        update_hash_index: bool = False,
    ) -> dict:
        """Ingest STAC items from an S3 Inventory into the catalog.

        Unified entry point replacing both ``backfill.run_backfill`` and
        ``incremental.run``.  Handles full backfill (drop+recreate table)
        and delta append (add files to existing table).

        The caller is responsible for holding an S3Lock around this call
        when running against a shared store (use ``self.lock()``).

        Args:
            inventory_path: Inventory location handed to ``_iter_inventory``.
            mode: ``"auto"`` picks ``"delta"`` when the table already holds
                rows and ``"full"`` otherwise (also ``"full"`` when the row
                count cannot be read).  ``"delta"`` appends to the existing
                table.  Any other value takes the full path, which drops and
                recreates the table.
            chunk_size: Number of ``(bucket, key)`` pairs collected before a
                batch is fetched and written.
            limit: Stop after this many ``.stac.json`` keys have been queued.
                A falsy value (``None`` or ``0``) means no limit.
            since: Forwarded to ``_iter_inventory``.
            update_hash_index: When true and at least one file was written,
                merge the new files' hashes into the hash index.

        Returns:
            A dict with ``items_processed``, ``rows_written`` and
            ``files_registered``.

        Raises:
            RuntimeError: If ``AWS_ACCESS_KEY_ID`` is not set in the
                environment.
        """
        import os
        import uuid
        from concurrent.futures import ThreadPoolExecutor

        from earthcatalog.grids import build_partitioner
        from earthcatalog.pipelines.incremental import _fetch_item, _iter_inventory

        from .hash_index import (
            merge_hashes_from_parquets,
            read_hashes,
            write_hashes,
        )
        from .transform import (
            fan_out,
            group_by_partition,
            write_geoparquet_s3,
        )

        # NOTE(review): only the env var is checked, so an IAM role without
        # AWS_ACCESS_KEY_ID exported is rejected despite the message below.
        if not os.environ.get("AWS_ACCESS_KEY_ID"):
            raise RuntimeError(
                "No AWS credentials found in environment. "
                "ingest() requires write access to S3. "
                "Set AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY or use an IAM role."
            )

        # Resolve "auto" from the current row count; any failure to read the
        # stats is treated as "no table yet" and selects a full rebuild.
        if mode == "auto":
            try:
                n = sum(s["row_count"] for s in self._info.stats(self._table))
                mode = "delta" if n > 0 else "full"
            except Exception:
                mode = "full"

        # NOTE(review): every mode other than "delta" (including typos) ends
        # up on the destructive drop+recreate path below — consider validating.
        is_delta = mode == "delta"

        from earthcatalog.config import GridConfig

        # Rebuild the partitioner from the grid settings stored with the catalog.
        grid_cfg = GridConfig(
            type=self._info.grid_type,
            resolution=self._info.grid_resolution,
            boundaries_path=self._info.boundaries_path,
            id_field=self._info.id_field,
        )
        partitioner = build_partitioner(grid_cfg)

        warehouse_root = self._catalog.properties.get("warehouse", "")
        uri = self._catalog.properties.get("uri", "")
        # Local SQLite file behind the catalog; falls back to a /tmp path
        # when the catalog has no "uri" property.
        local_db = uri.removeprefix("sqlite:///") if uri else "/tmp/earthcatalog.db"

        # Refresh the local catalog.db from the store before modifying it.
        if self._store and self._catalog_key:
            self.download_catalog(local_db)

        if not is_delta:
            from pyiceberg.exceptions import NoSuchTableError

            # Full mode: start from an empty table.
            try:
                self._catalog.drop_table(FULL_NAME)
            except NoSuchTableError:
                pass
            # Best-effort namespace creation; any error is ignored
            # (presumably "already exists" — TODO confirm).
            try:
                self._catalog.create_namespace(NAMESPACE)
            except Exception:
                pass
            self._table = get_or_create(self._catalog, grid_config=grid_cfg)

        total_items = 0
        total_rows = 0
        written_keys: list[str] = []
        batch: list[tuple[str, str]] = []

        def _flush(chunk: list[tuple[str, str]]) -> None:
            # Fetch one batch of items, fan them out to partitions and write
            # one Parquet part file per (cell, year) group.
            nonlocal total_rows

            # Fetch concurrently; falsy results from _fetch_item are dropped.
            with ThreadPoolExecutor(max_workers=16) as pool:
                items = list(filter(None, pool.map(lambda bc: _fetch_item(*bc), chunk)))

            if not items:
                return

            fo = fan_out(items, partitioner)
            if not fo:
                return

            for (cell, year), group_items in group_by_partition(fo).items():
                year_str = str(year) if year is not None else "unknown"
                # Random suffix so repeated flushes into the same partition
                # do not overwrite each other's part files.
                part_tag = uuid.uuid4().hex[:8]
                s3_key = f"grid_partition={cell}/year={year_str}/part_{part_tag}.parquet"
                n, _ = write_geoparquet_s3(group_items, self._store, s3_key)
                if n > 0:
                    written_keys.append(s3_key)
                    total_rows += n

        print(f"Ingesting from: {inventory_path}")
        for bucket, key in _iter_inventory(inventory_path, since=since):
            # Only STAC item documents are ingested.
            if not key.endswith(".stac.json"):
                continue
            batch.append((bucket, key))
            total_items += 1
            if len(batch) >= chunk_size:
                _flush(batch)
                batch.clear()
            if limit and total_items >= limit:
                break

        # Write whatever is left over from the last partial batch.
        if batch:
            _flush(batch)

        if written_keys:
            full_paths = [f"{warehouse_root.rstrip('/')}/{k}" for k in written_keys]
            # Register the new files with Iceberg in slices of 2000 paths.
            batch_sz = 2000
            for i in range(0, len(full_paths), batch_sz):
                self._table.add_files(full_paths[i : i + batch_sz])
            print(f"Registered {len(full_paths)} files in Iceberg catalog.")

        # full_paths is only bound when written_keys is non-empty, which the
        # condition below also requires.
        if update_hash_index and written_keys:
            hash_index_path = self._table.properties.get("earthcatalog.hash_index_path")
            if not hash_index_path:
                # No index recorded yet: use the default location and persist
                # it as a table property.
                hash_index_path = f"{warehouse_root.rstrip('/')}_id_hashes.parquet"
                with self._table.transaction() as tx:
                    tx.set_properties(**{"earthcatalog.hash_index_path": hash_index_path})

            if hash_index_path.startswith("s3://"):
                import re as _re

                # Split "s3://bucket/key"; only the key part is used with the store.
                m = _re.match(r"s3://([^/]+)/(.+)", hash_index_path)
                if m:
                    hash_key = m.group(2)
                    existing = read_hashes(self._store, hash_key)
                    print(f"  Existing hashes: {len(existing):,}")
                    updated, n_new = merge_hashes_from_parquets(
                        full_paths, existing, store=self._store
                    )
                    print(f"  New hashes: {n_new:,} from {len(full_paths)} files")
                    write_hashes(updated, self._store, hash_key)
            else:
                print("WARN: hash index update skipped — only s3:// paths supported")

        # Push the modified catalog.db back to the store.
        if self._store and self._catalog_key:
            self.upload_catalog(local_db)

        result = {
            "items_processed": total_items,
            "rows_written": total_rows,
            "files_registered": len(written_keys),
        }
        print(f"Done. {total_items} items -> {total_rows} rows in {len(written_keys)} files")
        return result

    def bulk_ingest(
        self,
        inventory_path: str,
        *,
        mode: str = "auto",
        chunk_size: int = 100_000,
        compact_rows: int = 100_000,
        limit: int | None = None,
        since: datetime | None = None,
        update_hash_index: bool = False,
        staging_prefix: str | None = None,
        create_client: Callable[[], object] | None = None,
        skip_inventory: bool = False,
        skip_ingest: bool = False,
        retry_pending: bool = False,
    ) -> None:
        """Ingest a large inventory through the backfill pipeline.

        Thin wrapper around ``run_backfill`` (described upstream as running on
        a distributed Dask cluster).  This catalog's store is used as both the
        staging and the warehouse store.  The module-level ``store_config`` is
        pointed at this catalog's store/key for the duration of the run and
        restored afterwards, even on failure.

        ``mode="full"`` disables delta mode; ``mode="auto"`` enables delta only
        when the table already holds rows (a failure to read the stats also
        selects a full run); every other value means delta.  When
        ``staging_prefix`` is omitted it defaults to ``bulk_ingest/<UTC date>``.

        Raises:
            RuntimeError: If ``AWS_ACCESS_KEY_ID`` is not set in the
                environment.
        """
        import os
        from datetime import UTC
        from datetime import datetime as _dt

        from earthcatalog.config import GridConfig
        from earthcatalog.grids import build_partitioner
        from earthcatalog.pipelines.backfill import run_backfill

        if not os.environ.get("AWS_ACCESS_KEY_ID"):
            raise RuntimeError(
                "No AWS credentials found in environment. "
                "bulk_ingest() requires write access to S3. "
                "Set AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY or use an IAM role."
            )

        warehouse_root = self._catalog.properties.get("warehouse", "")
        local_db = self._catalog.properties.get("uri", "").removeprefix("sqlite:///")

        meta = self._info
        partitioner = build_partitioner(
            GridConfig(
                type=meta.grid_type,
                resolution=meta.grid_resolution,
                boundaries_path=meta.boundaries_path,
                id_field=meta.id_field,
            )
        )

        if staging_prefix is None:
            # Default staging area is keyed by today's UTC date.
            staging_prefix = f"bulk_ingest/{_dt.now(UTC):%Y%m%d}"

        # Decide between appending (delta) and a full rebuild.
        if mode == "full":
            append_only = False
        elif mode == "auto":
            try:
                append_only = sum(s["row_count"] for s in self._info.stats(self._table)) > 0
            except Exception:
                append_only = False
        else:
            append_only = True

        if self._store and self._catalog_key:
            self.download_catalog(local_db)

        from . import store_config

        # Remember the global store configuration so it can be put back.
        previous_store = store_config.get_store()
        previous_key = store_config.get_catalog_key()
        try:
            store_config.set_store(self._store)
            if self._catalog_key:
                store_config.set_catalog_key(self._catalog_key)

            backfill_options = {
                "inventory_path": inventory_path,
                "catalog_path": local_db,
                "staging_store": self._store,
                "staging_prefix": staging_prefix,
                "warehouse_store": self._store,
                "warehouse_root": warehouse_root,
                "partitioner": partitioner,
                "chunk_size": chunk_size,
                "compact_rows": compact_rows,
                "limit": limit,
                "since": since,
                "use_lock": False,
                "upload": True,
                "skip_inventory": skip_inventory,
                "skip_ingest": skip_ingest,
                "retry_pending": retry_pending,
                "delta": append_only,
                "create_client": create_client,
                "update_hash_index": update_hash_index,
                "hash_index_path": self._table.properties.get("earthcatalog.hash_index_path"),
            }
            run_backfill(**backfill_options)
        finally:
            store_config.set_store(previous_store)
            store_config.set_catalog_key(previous_key)

    def download_catalog(self, local_path: str) -> None:
        """Fetch catalog.db from this catalog's backing store into *local_path*."""
        backing_store = self._store
        download_catalog(local_path, store=backing_store)

    def upload_catalog(self, local_path: str) -> None:
        """Push the catalog.db at *local_path* to this catalog's backing store."""
        backing_store = self._store
        upload_catalog(local_path, store=backing_store)

    def compact(
        self,
        threshold: int = 2,
        dry_run: bool = False,
    ) -> dict[str, int]:
        """Run warehouse compaction for this catalog.

        Delegates to :func:`earthcatalog.maintenance.compact.compact_warehouse`,
        passing the catalog's ``warehouse`` property and the local database
        path derived from its ``uri`` property (``sqlite:///`` prefix removed).

        Parameters
        ----------
        threshold:
            Minimum number of part files in a bucket before it is compacted.
            Default: 2 (compact any bucket with more than one part file).
        dry_run:
            When ``True``, report what *would* be compacted but make no changes.

        Returns
        -------
        Summary dict with keys ``buckets_scanned``, ``buckets_compacted``,
        ``files_before``, ``files_after``.
        """
        from earthcatalog.maintenance.compact import compact_warehouse

        warehouse_path = self._catalog.properties.get("warehouse", "")
        catalog_db = self._catalog.properties.get("uri", "").removeprefix("sqlite:///")
        compaction_args = {
            "warehouse_path": warehouse_path,
            "catalog_path": catalog_db,
            "threshold": threshold,
            "dry_run": dry_run,
        }
        return compact_warehouse(**compaction_args)

    def lock(self, owner: str, ttl_hours: int = 12):
        """Create an ``S3Lock`` bound to this catalog's store.

        The lock key is the catalog object's ``_lock_key`` attribute when it
        is present and truthy, otherwise ``".lock"``.
        """
        from .lock import S3Lock

        custom_key = getattr(self._catalog, "_lock_key", None)
        key = custom_key if custom_key else ".lock"
        return S3Lock(owner=owner, ttl_hours=ttl_hours, store=self._store, key=key)

    def cells_for_geometry(self, geom) -> list[str]:
        """Return the partition keys that intersect *geom*."""
        return self._info.cells_for_geometry(geom)

    def cell_list_sql(self, geom) -> str:
        """Return a SQL fragment suitable for ``WHERE grid_partition IN (...)``."""
        return self._info.cell_list_sql(geom)

    @property
    def grid_type(self) -> str:
        """Return the grid partitioning system type."""
        return self._info.grid_type

    @property
    def grid_resolution(self) -> int | None:
        """Return the H3/S2 resolution (None for GeoJSON grids)."""
        return self._info.grid_resolution

    @property
    def table(self):
        """Return the underlying PyIceberg Table (for advanced use)."""
        return self._table

    def _repr_html_(self) -> str:
        """Return an HTML representation for Jupyter notebooks.

        Single-column layout with metadata table and collapsible top partitions.
        Reads only Iceberg manifests — no Parquet data is scanned.
        """
        rows = [("Grid type", self._info.grid_type)]

        if self._info.grid_type == "h3":
            rows.append(("H3 resolution", str(self._info.grid_resolution)))
        else:
            rows.append(("Boundaries", self._info.boundaries_path or "N/A"))

        warehouse_path = self._catalog.properties.get("warehouse", "") if self._catalog else ""
        if warehouse_path:
            rows.append(("Warehouse", warehouse_path))

        hash_idx = self._table.properties.get("earthcatalog.hash_index_path")
        rows.append(("Hash index", "Available" if hash_idx else "Not available"))

        table_html = "<table style='border-collapse: collapse; width: 100%; margin: 0;'>"
        for label, value in rows:
            table_html += f"""
                <tr style='border-bottom: 1px solid currentColor;'>
                    <td style='padding: 6px 10px; border: none; width: 180px;'>{label}</td>
                    <td style='padding: 6px 10px; border: none;'><strong>{value}</strong></td>
                </tr>"""
        table_html += "</table>"

        stats = self._info.stats(self._table)
        bottom_html = ""
        if stats:
            total_files = self._info.total_files(self._table)
            total_rows = sum(s["row_count"] for s in stats)
            warehouse = self._catalog.properties.get("warehouse", "") if self._catalog else ""
            default_hi = warehouse.rstrip("/") + "_id_hashes.parquet" if warehouse else None
            unique = self._info.unique_item_count(self._table, self._store, default_hi)

            stat_rows = [
                ("Total files", f"{total_files:,}"),
                ("Total rows", f"{total_rows:,}"),
                ("Unique items", f"{unique:,}"),
                ("Partitions", f"{len(stats):,}"),
            ]
            stats_table = "<table style='border-collapse: collapse; width: 100%; font-size: 13px; margin: 0;'>"
            for label, value in stat_rows:
                stats_table += f"""
                    <tr style='border-bottom: 1px solid currentColor;'>
                        <td style='padding: 4px 6px; border: none; width: 180px;'>{label}</td>
                        <td style='padding: 4px 6px; border: none;'><strong>{value}</strong></td>
                    </tr>"""
            stats_table += "</table>"

            top_cells = self._info.top_cells(self._table, limit=3)
            top_html = ""
            if top_cells:
                top_rows = ""
                for cell in top_cells:
                    top_rows += f"""
                        <tr style='border-bottom: 1px solid currentColor;'>
                            <td style='padding: 4px 6px; border: none; width: 180px; font-family: monospace;'>{cell["grid_partition"][:12]}...</td>
                            <td style='padding: 4px 6px; border: none;'>{cell["row_count"]:,} rows</td>
                        </tr>"""
                top_html = f"""
                <details style='margin-top: 12px;'>
                    <summary style='font-weight: 600; cursor: pointer;'>Top partitions</summary>
                    <table style='border-collapse: collapse; width: 100%; font-size: 13px; margin: 8px 0 0 0;'>{top_rows}</table>
                </details>"""

            bottom_html = f"""
            <div style='font-weight: 600; margin-top: 12px;'>Statistics</div>
            {stats_table}
            {top_html}"""

        return f"""
        <div         style='border: 1px solid currentColor; padding: 15px; border-radius: 5px; font-family: var(--jp-code-font-family, monospace); opacity: 0.9; text-align: left; max-width: 800px;'>
            <div style='font-size: 16px; font-weight: 600; margin-bottom: 12px; display: flex; align-items: center; gap: 8px;'>
                <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20" height="20" fill="none" stroke="currentColor" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round">
                    <circle cx="12" cy="12" r="10"/>
                    <path d="M2 12h20M12 2c-3.3 2-5.5 5.5-5.5 10s2.2 8 5.5 10c3.3-2 5.5-5.5 5.5-10S15.3 4 12 2z"/>
                </svg>
                <span>EarthCatalog</span>
            </div>
            {table_html}
            {bottom_html}
        </div>"""

    def __repr__(self) -> str:
        if self._info.grid_type == "h3":
            return f"EarthCatalog(grid_type='h3', resolution={self._info.grid_resolution})"
        return (
            f"EarthCatalog(grid_type='geojson', "
            f"boundaries_path={self._info.boundaries_path!r}, "
            f"id_field={self._info.id_field!r})"
        )

Attributes¶

`grid_type` `property` ¶

Return the grid partitioning system type.

`grid_resolution` `property` ¶

Return the H3/S2 resolution (None for GeoJSON grids).

`table` `property` ¶

Return the underlying PyIceberg Table (for advanced use).

Functions¶

`__init__(catalog, table, info, store=None, *, catalog_key=None)` ¶

Initialize an EarthCatalog facade.

Parameters:

Name	Type	Description	Default
`catalog`	`object`	PyIceberg SqlCatalog instance	required
`table`	`Table`	PyIceberg Table instance	required
`info`	`CatalogInfo`	CatalogInfo with grid metadata	required
`store`	`object \| None`	obstore Store instance (for reading hash index from S3)	`None`
`catalog_key`	`str \| None`	Key within store where catalog.db is persisted. Required for ingest() which needs to upload changes.	`None`

Source code in earthcatalog/catalog.py

def __init__(
    self,
    catalog: object,
    table: Table,
    info: CatalogInfo,
    store: object | None = None,
    *,
    catalog_key: str | None = None,
):
    """Bind the components this facade delegates to.

    Args:
        catalog: PyIceberg SqlCatalog instance.
        table: PyIceberg Table instance.
        info: CatalogInfo carrying the grid metadata.
        store: Optional obstore Store instance (used for reading the
            hash index from S3).
        catalog_key: Key within *store* where catalog.db is persisted.
            Required for ingest(), which needs to upload changes.
    """
    # Iceberg handles.
    self._catalog, self._table = catalog, table
    # Grid metadata and the optional object-store handle.
    self._info, self._store = info, store
    self._catalog_key = catalog_key

`search_files(geom, start_datetime=None, end_datetime=None)` ¶

Return Parquet file paths for partitions intersecting geom.

Source code in earthcatalog/catalog.py

def search_files(
    self,
    geom,
    start_datetime: str | datetime | None = None,
    end_datetime: str | datetime | None = None,
) -> list[str]:
    """Return Parquet file paths for partitions intersecting *geom*."""
    return self._info.file_paths(
        self._table,
        geom,
        start_datetime=start_datetime,
        end_datetime=end_datetime,
    )

`search(**kwargs)` ¶

Search across the catalog, returning a deferred EarthCatalogItemSearch.

Accepts the same kwargs as `rustac.search`: `intersects`, `bbox`, `datetime`, `filter` (CQL2 JSON), `ids`, `collections`, `max_items`, `limit`, `sortby`, `include`, `exclude`, `query`, etc.

Use the top-level datetime kwarg for temporal filtering. Do not reference datetime inside the CQL2 filter — rustac generates broken SQL when datetime appears in a CQL2 expression.

Performance¶

For fastest results use `duck_search()` with `format="native"` (DuckDB parallel I/O, ~2× faster across all query types). `search()` and `search_to_arrow()` use rustac (sequential per-file) and have comparable speed. See the "Search performance" page (`/operations/search_performance`) for detailed benchmarks.

Returns¶

EarthCatalogItemSearch A lazy, pystac_client-compatible search result. No I/O until items(), item_collection(), or pages() is called.

Source code in earthcatalog/catalog.py

def search(self, **kwargs):
    """Build a deferred ``EarthCatalogItemSearch`` over the whole catalog.

    The keyword arguments are stored as the search parameters and follow
    :func:`rustac.search`: ``intersects``, ``bbox``, ``datetime``,
    ``filter`` (CQL2 JSON), ``ids``, ``collections``, ``max_items``,
    ``limit``, ``sortby``, ``include``, ``exclude``, ``query``, etc.

    Temporal filtering belongs in the top-level ``datetime`` kwarg.  Do
    **not** mention ``datetime`` inside the CQL2 ``filter`` — rustac
    generates broken SQL when ``datetime`` appears in a CQL2 expression.

    Performance
    -----------
    ``search()`` and ``search_to_arrow()`` go through rustac (sequential
    per-file) and have comparable speed.  :meth:`duck_search` relies on
    DuckDB's parallel I/O and is reported as ~2× faster across all query
    types; see :doc:`/operations/search_performance` for the benchmarks.

    Returns
    -------
    EarthCatalogItemSearch
        A lazy, pystac_client-compatible search result.  No I/O until
        ``items()``, ``item_collection()``, or ``pages()`` is called.
    """
    from .search import EarthCatalogItemSearch, _FileSearchEngine

    # The engine prunes candidate files through this catalog's own hook.
    return EarthCatalogItemSearch(
        params=kwargs,
        engine=_FileSearchEngine(prune_fn=self._search_prune),
        table=self._table,
        anonymous_ctx=self._cleared_env_s3,
    )

`search_to_arrow(**kwargs)` ¶

Search across the catalog, returning a PyArrow table.

Source code in earthcatalog/catalog.py

def search_to_arrow(self, **kwargs):
    """Run a search over the catalog and hand the result back as a PyArrow table.

    Keyword arguments are forwarded untouched to the file search engine,
    which is executed inside the ``_cleared_env_s3`` context manager.
    """
    from .search import _FileSearchEngine

    file_engine = _FileSearchEngine(prune_fn=self._search_prune)
    with self._cleared_env_s3():
        arrow_result = file_engine.search_to_arrow(**kwargs)
    return arrow_result

`search_uris(**kwargs)` ¶

Return asset URIs as a DataFrame with (id, uri) columns.

Accepts the same kwargs as :meth:search (intersects, bbox, datetime, filter, max_items, etc.).

Uses search_files() + DuckDB internally, reading only the id and assets columns from S3 — fastest way to get download URLs for thousands of items. Returns a pandas.DataFrame.

Examples::

import cql2
df = catalog.search_uris(
    intersects={"type": "Point", "coordinates": [-45, 70]},
    datetime="2020-01-01/2020-12-31",
    filter=cql2.parse_text("percent_valid_pixels >= 80").to_json(),
    max_items=100,
)
# df has columns: id, uri
for _, row in df.iterrows():
    print(row.id, row.uri)

Source code in earthcatalog/catalog.py

def search_uris(self, **kwargs):
    """Return asset URIs as a DataFrame with ``(id, uri)`` columns.

    Accepts the same kwargs as :meth:`search` (``intersects``, ``bbox``,
    ``datetime``, ``filter``, ``max_items``, etc.).

    Candidate Parquet files are pruned through the Iceberg table first, then
    DuckDB reads **only** the ``id`` and ``assets`` columns from them.  The
    URI of an item is the ``href`` of its ``data`` asset; items whose assets
    cannot be decoded, or that carry no such asset, get ``None``.

    ``max_items`` is applied after the query by truncating the result, not
    through a SQL ``LIMIT``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``uri``; empty (with both columns) when no file
        survives pruning.

    Examples::

        import cql2
        df = catalog.search_uris(
            intersects={"type": "Point", "coordinates": [-45, 70]},
            datetime="2020-01-01/2020-12-31",
            filter=cql2.parse_text("percent_valid_pixels >= 80").to_json(),
            max_items=100,
        )
        # df has columns: id, uri
        for _, row in df.iterrows():
            print(row.id, row.uri)
    """
    import json

    import duckdb
    import pandas as pd
    from shapely.geometry import shape

    from .search import _extract_datetime_range

    def _sql_literal(value) -> str:
        # Standard SQL string literal: wrap in single quotes and double any
        # embedded quote.  repr() is not safe for this — for a value that
        # contains "'" it switches to double quotes, which is not a string
        # literal in SQL.
        return "'" + str(value).replace("'", "''") + "'"

    def _data_href(raw_assets):
        # Best-effort extraction: anything undecodable yields None.
        if not raw_assets:
            return None
        try:
            # NOTE(review): assets is presumably stored as a JSON string; a
            # value that is already a dict is used as-is — confirm schema.
            assets = raw_assets if isinstance(raw_assets, dict) else json.loads(raw_assets)
            return assets.get("data", {}).get("href")
        except (json.JSONDecodeError, AttributeError, TypeError):
            return None

    # --- geometry ---
    geom = None
    if "intersects" in kwargs:
        geom = shape(kwargs["intersects"])
    elif "bbox" in kwargs:
        from shapely.geometry import box

        b = kwargs["bbox"]
        geom = box(b[0], b[1], b[2], b[3])

    # --- Iceberg pruning ---
    start_dt, end_dt = _extract_datetime_range(**kwargs)
    paths = self._info.file_paths(
        self._table,
        geom,
        start_datetime=start_dt,
        end_datetime=end_dt,
    )
    if not paths:
        return pd.DataFrame({"id": [], "uri": []})

    # --- build SQL (read only id + assets) ---
    path_list = ", ".join(_sql_literal(p) for p in paths)
    conditions: list[str] = []
    if geom is not None:
        conditions.append(f"ST_Intersects(geometry, ST_GeomFromText({_sql_literal(geom.wkt)}))")
    if start_dt is not None:
        conditions.append(f"datetime >= {_sql_literal(start_dt)}")
    if end_dt is not None:
        conditions.append(f"datetime <= {_sql_literal(end_dt)}")
    raw_filter = kwargs.get("filter")
    if raw_filter is not None:
        from .search import _cql2_to_sql

        conditions.append(_cql2_to_sql(raw_filter))
    where = " AND ".join(conditions) if conditions else "TRUE"
    max_items = kwargs.get("max_items")

    sql = f"""SELECT id, assets FROM read_parquet([{path_list}]) WHERE {where}"""

    # --- execute (Arrow → list is faster than pandas iterrows) ---
    con = duckdb.connect()
    try:
        con.execute("INSTALL spatial; LOAD spatial;")
        # Blank out any S3 credentials on this connection.
        con.execute("SET s3_access_key_id='';")
        con.execute("SET s3_secret_access_key='';")
        con.execute("SET s3_session_token='';")
        table = con.execute(sql).to_arrow_table()
    finally:
        # Release the connection even when the query fails.
        con.close()
    if max_items is not None and table.num_rows > max_items:
        table = table.slice(0, max_items)

    # --- extract data URIs from JSON assets ---
    ids = table.column("id").to_pylist()
    uris = [_data_href(a) for a in table.column("assets").to_pylist()]

    return pd.DataFrame({"id": ids, "uri": uris})

`duck_search(**kwargs)` ¶

Search using DuckDB, returning results as a pandas.DataFrame.

Accepts the same kwargs as :meth:search (intersects, bbox, datetime, filter, max_items, etc.).

DuckDB reads Parquet files in parallel internally, making this ~2× faster than :meth:search across all query types. Returns a DataFrame with flat columns — no pystac conversion overhead. For pystac Items use :meth:search (lazy iteration).

Examples::

df = catalog.duck_search(
    intersects={"type": "Point", "coordinates": [-45, 70]},
    datetime="1980-01-01/2015-12-31",
    max_items=100,
)
# df is a pandas.DataFrame
print(df.columns.tolist())

Source code in earthcatalog/catalog.py

def duck_search(self, **kwargs):
    """Search using DuckDB, returning results as a ``pandas.DataFrame``.

    Accepts the same kwargs as :meth:`search` (``intersects``, ``bbox``,
    ``datetime``, ``filter``, ``max_items``, etc.).

    Candidate Parquet files are pruned through the Iceberg table, then read
    with a single DuckDB ``read_parquet`` query that selects every column.
    The result is a DataFrame with flat columns — no pystac conversion.
    For pystac Items use :meth:`search` (lazy iteration).

    ``max_items`` is applied after the query by truncating the DataFrame,
    not through a SQL ``LIMIT``.

    Returns
    -------
    pandas.DataFrame
        The matching rows; a completely empty DataFrame (no columns) when
        no file survives pruning.

    Examples::

        df = catalog.duck_search(
            intersects={"type": "Point", "coordinates": [-45, 70]},
            datetime="1980-01-01/2015-12-31",
            max_items=100,
        )
        # df is a pandas.DataFrame
        print(df.columns.tolist())
    """
    import duckdb
    from shapely.geometry import shape

    from .search import _cql2_to_sql, _extract_datetime_range

    def _sql_literal(value) -> str:
        # Standard SQL string literal: wrap in single quotes and double any
        # embedded quote.  repr() is not safe for this — for a value that
        # contains "'" it switches to double quotes, which is not a string
        # literal in SQL.
        return "'" + str(value).replace("'", "''") + "'"

    geom = None
    if "intersects" in kwargs:
        geom = shape(kwargs["intersects"])
    elif "bbox" in kwargs:
        from shapely.geometry import box

        b = kwargs["bbox"]
        geom = box(b[0], b[1], b[2], b[3])

    start_dt, end_dt = _extract_datetime_range(**kwargs)
    paths = self._info.file_paths(
        self._table, geom, start_datetime=start_dt, end_datetime=end_dt
    )
    if not paths:
        import pandas as pd

        return pd.DataFrame()

    path_list = ", ".join(_sql_literal(p) for p in paths)
    conditions: list[str] = []
    if geom is not None:
        conditions.append(f"ST_Intersects(geometry, ST_GeomFromText({_sql_literal(geom.wkt)}))")
    if start_dt is not None:
        conditions.append(f"datetime >= {_sql_literal(start_dt)}")
    if end_dt is not None:
        conditions.append(f"datetime <= {_sql_literal(end_dt)}")
    raw_filter = kwargs.get("filter")
    if raw_filter is not None:
        conditions.append(_cql2_to_sql(raw_filter))

    where = " AND ".join(conditions) if conditions else "TRUE"
    max_items = kwargs.get("max_items")
    # LIMIT omitted — triggers 7× slower plan for multi-file reads
    sql = f"SELECT * FROM read_parquet([{path_list}]) WHERE {where}"

    con = duckdb.connect()
    try:
        con.execute("INSTALL spatial; LOAD spatial;")
        # Blank out any S3 credentials on this connection.
        con.execute("SET s3_access_key_id='';")
        con.execute("SET s3_secret_access_key='';")
        con.execute("SET s3_session_token='';")
        df = con.execute(sql).fetchdf()
    finally:
        # Release the connection even when the query fails.
        con.close()
    if max_items is not None and len(df) > max_items:
        df = df.head(max_items)
    return df

`stats()` ¶

Return per-partition row counts and file sizes from Iceberg metadata.

Source code in earthcatalog/catalog.py

def stats(self) -> list[dict]:
    """Report per-partition row counts and file sizes taken from Iceberg metadata.

    Delegates to ``self._info.stats`` for this catalog's table.
    """
    partition_stats = self._info.stats(self._table)
    return partition_stats

`unique_item_count()` ¶

Return the count of unique STAC items from the hash index.

Source code in earthcatalog/catalog.py

def unique_item_count(self) -> int:
    """Count the unique STAC items recorded in the hash index.

    When the catalog exposes a non-empty ``warehouse`` property, the path
    ``<warehouse>_id_hashes.parquet`` (trailing slashes stripped first) is
    offered to ``self._info.unique_item_count`` as the default hash-index
    location; otherwise ``None`` is passed.
    """
    warehouse = "" if self._catalog is None else self._catalog.properties.get("warehouse", "")
    fallback_path = warehouse.rstrip("/") + "_id_hashes.parquet" if warehouse else None
    return self._info.unique_item_count(self._table, self._store, fallback_path)

`info()` ¶

Return the grid metadata and catalog statistics object.

Source code in earthcatalog/catalog.py

def info(self) -> CatalogInfo:
    """Expose the CatalogInfo object (grid metadata and catalog statistics) held by this facade."""
    catalog_info = self._info
    return catalog_info

`ingest(inventory_path, *, mode='auto', chunk_size=10000, limit=None, since=None, update_hash_index=False)` ¶

Ingest STAC items from an S3 Inventory into the catalog.

Unified entry point replacing both backfill.run_backfill and incremental.run. Handles full backfill (drop+recreate table) and delta append (add files to existing table).

The caller is responsible for holding an S3Lock around this call when running against a shared store (use self.lock()).

Source code in earthcatalog/catalog.py

def ingest(
    self,
    inventory_path: str,
    *,
    mode: str = "auto",
    chunk_size: int = 10000,
    limit: int | None = None,
    since: datetime | None = None,
    update_hash_index: bool = False,
) -> dict:
    """Ingest STAC items from an S3 Inventory into the catalog.

    Unified entry point replacing both ``backfill.run_backfill`` and
    ``incremental.run``.  Handles full backfill (drop+recreate table)
    and delta append (add files to existing table).

    The caller is responsible for holding an S3Lock around this call
    when running against a shared store (use ``self.lock()``).
    """
    import os
    import uuid
    from concurrent.futures import ThreadPoolExecutor

    from earthcatalog.grids import build_partitioner
    from earthcatalog.pipelines.incremental import _fetch_item, _iter_inventory

    from .hash_index import (
        merge_hashes_from_parquets,
        read_hashes,
        write_hashes,
    )
    from .transform import (
        fan_out,
        group_by_partition,
        write_geoparquet_s3,
    )

    if not os.environ.get("AWS_ACCESS_KEY_ID"):
        raise RuntimeError(
            "No AWS credentials found in environment. "
            "ingest() requires write access to S3. "
            "Set AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY or use an IAM role."
        )

    if mode == "auto":
        try:
            n = sum(s["row_count"] for s in self._info.stats(self._table))
            mode = "delta" if n > 0 else "full"
        except Exception:
            mode = "full"

    is_delta = mode == "delta"

    from earthcatalog.config import GridConfig

    grid_cfg = GridConfig(
        type=self._info.grid_type,
        resolution=self._info.grid_resolution,
        boundaries_path=self._info.boundaries_path,
        id_field=self._info.id_field,
    )
    partitioner = build_partitioner(grid_cfg)

    warehouse_root = self._catalog.properties.get("warehouse", "")
    uri = self._catalog.properties.get("uri", "")
    local_db = uri.removeprefix("sqlite:///") if uri else "/tmp/earthcatalog.db"

    if self._store and self._catalog_key:
        self.download_catalog(local_db)

    if not is_delta:
        from pyiceberg.exceptions import NoSuchTableError

        try:
            self._catalog.drop_table(FULL_NAME)
        except NoSuchTableError:
            pass
        try:
            self._catalog.create_namespace(NAMESPACE)
        except Exception:
            pass
        self._table = get_or_create(self._catalog, grid_config=grid_cfg)

    total_items = 0
    total_rows = 0
    written_keys: list[str] = []
    batch: list[tuple[str, str]] = []

    def _flush(chunk: list[tuple[str, str]]) -> None:
        nonlocal total_rows

        with ThreadPoolExecutor(max_workers=16) as pool:
            items = list(filter(None, pool.map(lambda bc: _fetch_item(*bc), chunk)))

        if not items:
            return

        fo = fan_out(items, partitioner)
        if not fo:
            return

        for (cell, year), group_items in group_by_partition(fo).items():
            year_str = str(year) if year is not None else "unknown"
            part_tag = uuid.uuid4().hex[:8]
            s3_key = f"grid_partition={cell}/year={year_str}/part_{part_tag}.parquet"
            n, _ = write_geoparquet_s3(group_items, self._store, s3_key)
            if n > 0:
                written_keys.append(s3_key)
                total_rows += n

    print(f"Ingesting from: {inventory_path}")
    for bucket, key in _iter_inventory(inventory_path, since=since):
        if not key.endswith(".stac.json"):
            continue
        batch.append((bucket, key))
        total_items += 1
        if len(batch) >= chunk_size:
            _flush(batch)
            batch.clear()
        if limit and total_items >= limit:
            break

    if batch:
        _flush(batch)

    if written_keys:
        full_paths = [f"{warehouse_root.rstrip('/')}/{k}" for k in written_keys]
        batch_sz = 2000
        for i in range(0, len(full_paths), batch_sz):
            self._table.add_files(full_paths[i : i + batch_sz])
        print(f"Registered {len(full_paths)} files in Iceberg catalog.")

    if update_hash_index and written_keys:
        hash_index_path = self._table.properties.get("earthcatalog.hash_index_path")
        if not hash_index_path:
            hash_index_path = f"{warehouse_root.rstrip('/')}_id_hashes.parquet"
            with self._table.transaction() as tx:
                tx.set_properties(**{"earthcatalog.hash_index_path": hash_index_path})

        if hash_index_path.startswith("s3://"):
            import re as _re

            m = _re.match(r"s3://([^/]+)/(.+)", hash_index_path)
            if m:
                hash_key = m.group(2)
                existing = read_hashes(self._store, hash_key)
                print(f"  Existing hashes: {len(existing):,}")
                updated, n_new = merge_hashes_from_parquets(
                    full_paths, existing, store=self._store
                )
                print(f"  New hashes: {n_new:,} from {len(full_paths)} files")
                write_hashes(updated, self._store, hash_key)
        else:
            print("WARN: hash index update skipped — only s3:// paths supported")

    if self._store and self._catalog_key:
        self.upload_catalog(local_db)

    result = {
        "items_processed": total_items,
        "rows_written": total_rows,
        "files_registered": len(written_keys),
    }
    print(f"Done. {total_items} items -> {total_rows} rows in {len(written_keys)} files")
    return result

`bulk_ingest(inventory_path, *, mode='auto', chunk_size=100000, compact_rows=100000, limit=None, since=None, update_hash_index=False, staging_prefix=None, create_client=None, skip_inventory=False, skip_ingest=False, retry_pending=False)` ¶

Ingest large inventories using a distributed Dask cluster.

Source code in earthcatalog/catalog.py

def bulk_ingest(
    self,
    inventory_path: str,
    *,
    mode: str = "auto",
    chunk_size: int = 100_000,
    compact_rows: int = 100_000,
    limit: int | None = None,
    since: datetime | None = None,
    update_hash_index: bool = False,
    staging_prefix: str | None = None,
    create_client: Callable[[], object] | None = None,
    skip_inventory: bool = False,
    skip_ingest: bool = False,
    retry_pending: bool = False,
) -> None:
    """Ingest large inventories using a distributed Dask cluster."""
    import os
    from datetime import UTC
    from datetime import datetime as _dt

    from earthcatalog.config import GridConfig
    from earthcatalog.grids import build_partitioner
    from earthcatalog.pipelines.backfill import run_backfill

    if not os.environ.get("AWS_ACCESS_KEY_ID"):
        raise RuntimeError(
            "No AWS credentials found in environment. "
            "bulk_ingest() requires write access to S3. "
            "Set AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY or use an IAM role."
        )

    warehouse_root = self._catalog.properties.get("warehouse", "")
    uri = self._catalog.properties.get("uri", "")
    local_db = uri.removeprefix("sqlite:///")

    grid_cfg = GridConfig(
        type=self._info.grid_type,
        resolution=self._info.grid_resolution,
        boundaries_path=self._info.boundaries_path,
        id_field=self._info.id_field,
    )
    partitioner = build_partitioner(grid_cfg)

    if staging_prefix is None:
        date_str = _dt.now(UTC).strftime("%Y%m%d")
        staging_prefix = f"bulk_ingest/{date_str}"

    delta = True
    if mode == "full":
        delta = False
    elif mode == "auto":
        try:
            n = sum(s["row_count"] for s in self._info.stats(self._table))
            delta = n > 0
        except Exception:
            delta = False

    if self._store and self._catalog_key:
        self.download_catalog(local_db)

    from . import store_config

    old_store = store_config.get_store()
    old_key = store_config.get_catalog_key()
    try:
        store_config.set_store(self._store)
        if self._catalog_key:
            store_config.set_catalog_key(self._catalog_key)

        run_backfill(
            inventory_path=inventory_path,
            catalog_path=local_db,
            staging_store=self._store,
            staging_prefix=staging_prefix,
            warehouse_store=self._store,
            warehouse_root=warehouse_root,
            partitioner=partitioner,
            chunk_size=chunk_size,
            compact_rows=compact_rows,
            limit=limit,
            since=since,
            use_lock=False,
            upload=True,
            skip_inventory=skip_inventory,
            skip_ingest=skip_ingest,
            retry_pending=retry_pending,
            delta=delta,
            create_client=create_client,
            update_hash_index=update_hash_index,
            hash_index_path=self._table.properties.get("earthcatalog.hash_index_path"),
        )
    finally:
        store_config.set_store(old_store)
        store_config.set_catalog_key(old_key)

`download_catalog(local_path)` ¶

Download catalog.db from the backing store to local_path.

Source code in earthcatalog/catalog.py

def download_catalog(self, local_path: str) -> None:
    """Fetch catalog.db from the backing store and write it to *local_path*.

    Delegates to the module-level ``download_catalog`` function, passing
    this instance's store.
    """
    backing_store = self._store
    download_catalog(local_path, store=backing_store)

`upload_catalog(local_path)` ¶

Upload catalog.db from local_path to the backing store.

Source code in earthcatalog/catalog.py

def upload_catalog(self, local_path: str) -> None:
    """Push the catalog.db found at *local_path* to the backing store.

    Delegates to the module-level ``upload_catalog`` function, passing
    this instance's store.
    """
    backing_store = self._store
    upload_catalog(local_path, store=backing_store)

`compact(threshold=2, dry_run=False)` ¶

Compact over-threshold partition buckets and rebuild the Iceberg catalog.

Wraps `earthcatalog.maintenance.compact.compact_warehouse` using this catalog's warehouse path and local catalog database.

Parameters¶

`threshold`: Minimum number of part files in a bucket before it is compacted. Default: 2 (compact any bucket with more than one part file).

`dry_run`: When True, report what would be compacted but make no changes.

Returns¶

Summary dict with keys buckets_scanned, buckets_compacted, files_before, files_after.

Source code in earthcatalog/catalog.py

def compact(
    self,
    threshold: int = 2,
    dry_run: bool = False,
) -> dict[str, int]:
    """Merge the part files of crowded partition buckets and rebuild the Iceberg catalog.

    Thin wrapper over
    :func:`earthcatalog.maintenance.compact.compact_warehouse`; the warehouse
    path and the local catalog database path (the catalog ``uri`` without
    its ``sqlite:///`` prefix) come from this catalog's properties.

    Parameters
    ----------
    threshold:
        Minimum number of part files in a bucket before it is compacted.
        Default: 2 (compact any bucket with more than one part file).
    dry_run:
        When ``True``, report what *would* be compacted but make no changes.

    Returns
    -------
    Summary dict with keys ``buckets_scanned``, ``buckets_compacted``,
    ``files_before``, ``files_after``.
    """
    from earthcatalog.maintenance.compact import compact_warehouse

    props = self._catalog.properties
    return compact_warehouse(
        warehouse_path=props.get("warehouse", ""),
        catalog_path=props.get("uri", "").removeprefix("sqlite:///"),
        threshold=threshold,
        dry_run=dry_run,
    )

`lock(owner, ttl_hours=12)` ¶

Return an S3Lock that uses this EarthCatalog's store and key.

Source code in earthcatalog/catalog.py

def lock(self, owner: str, ttl_hours: int = 12):
    """Create an S3Lock bound to this EarthCatalog's store.

    The lock key is the catalog's ``_lock_key`` attribute when it is set to
    a truthy value, and ``".lock"`` otherwise.
    """
    from .lock import S3Lock

    key = getattr(self._catalog, "_lock_key", None)
    if not key:
        key = ".lock"
    return S3Lock(owner=owner, ttl_hours=ttl_hours, store=self._store, key=key)

`cells_for_geometry(geom)` ¶

Return the partition keys that intersect geom.

Source code in earthcatalog/catalog.py

def cells_for_geometry(self, geom) -> list[str]:
    """List the partition keys intersecting *geom* (delegates to ``self._info``)."""
    grid_info = self._info
    return grid_info.cells_for_geometry(geom)

`cell_list_sql(geom)` ¶

Return a SQL fragment suitable for WHERE grid_partition IN (...).

Source code in earthcatalog/catalog.py

def cell_list_sql(self, geom) -> str:
    """Build a SQL fragment usable as ``WHERE grid_partition IN (...)`` (delegates to ``self._info``)."""
    grid_info = self._info
    return grid_info.cell_list_sql(geom)

`earthcatalog.catalog` ¶

EarthCatalog — simplified facade for querying spatially-partitioned STAC catalogs.

Provides a clean API that encapsulates PyIceberg catalog, table, and grid metadata discovery into a single object.

Classes¶

`CatalogInfo` `dataclass` ¶

Grid metadata read from Iceberg table properties.

Source code in earthcatalog/catalog.py

@dataclass
class CatalogInfo:
    """Grid metadata read from Iceberg table properties."""

    grid_type: str
    grid_resolution: int | None
    boundaries_path: str | None
    id_field: str | None
    _cached_stats: list[dict] | None = field(default=None, repr=False)
    _cached_top_cells: list[dict] | None = field(default=None, repr=False)

    def cells_for_geometry(self, geom) -> list[str]:
        """Return the partition keys that intersect *geom*."""
        if self.grid_type == "h3":
            return self._h3_cells(geom)
        if self.grid_type == "geojson":
            return self._geojson_keys(geom)
        raise ValueError(f"Unknown grid type: {self.grid_type!r}")

    def cell_list_sql(self, geom) -> str:
        """Return a SQL fragment suitable for ``WHERE grid_partition IN (...)``."""
        cells = self.cells_for_geometry(geom)
        if not cells:
            return "grid_partition IN (NULL)"
        quoted = ", ".join(f"'{c}'" for c in cells)
        return f"grid_partition IN ({quoted})"

    def file_paths(
        self,
        table,
        geom,
        start_datetime: str | datetime | None = None,
        end_datetime: str | datetime | None = None,
    ) -> list[str]:
        """Return Parquet file paths for partitions intersecting *geom*."""
        from pyiceberg.expressions import And, GreaterThanOrEqual, In, LessThanOrEqual

        cells = self.cells_for_geometry(geom)
        if not cells:
            return []

        expr = In("grid_partition", cells)
        if start_datetime is not None:
            expr = And(expr, GreaterThanOrEqual("datetime", _parse_dt(start_datetime)))
        if end_datetime is not None:
            expr = And(expr, LessThanOrEqual("datetime", _parse_dt(end_datetime)))

        start_year = _parse_dt(start_datetime).year if start_datetime is not None else None
        end_year = _parse_dt(end_datetime).year if end_datetime is not None else None

        paths = []
        for task in table.scan(row_filter=expr).plan_files():
            year = task.file.partition[1] + 1970
            if start_year is not None and year < start_year:
                continue
            if end_year is not None and year > end_year:
                continue
            paths.append(task.file.file_path)

        return paths

    def _ensure_stats(self, table) -> list[dict]:
        if self._cached_stats is None:
            self._cached_stats = _build_stats_cache(table)
            if self._cached_top_cells is None:
                cell_agg: dict[str, list[int]] = defaultdict(lambda: [0, 0])
                for s in self._cached_stats:
                    cell_agg[s["grid_partition"]][0] += s["row_count"]
                    cell_agg[s["grid_partition"]][1] += s["file_count"]
                self._cached_top_cells = sorted(
                    [
                        {"grid_partition": cell, "row_count": rows, "file_count": files}
                        for cell, (rows, files) in cell_agg.items()
                    ],
                    key=lambda d: d["row_count"],
                    reverse=True,
                )
        return self._cached_stats

    def stats(self, table) -> list[dict]:
        """Per-partition row counts and file sizes from Iceberg manifests."""
        return self._ensure_stats(table)

    def top_cells(self, table, limit: int = 5) -> list[dict]:
        """Top partitions by row count (cached alongside :meth:`stats`)."""
        self._ensure_stats(table)
        return self._cached_top_cells[:limit]  # type: ignore[index]

    def total_files(self, table) -> int:
        """Total Parquet file count from Iceberg snapshot manifests."""
        return sum(s["file_count"] for s in self._ensure_stats(table))

    def unique_item_count(self, table, store, default_hash_index_path: str | None = None) -> int:
        """Row count of the hash-index Parquet file (footer read only)."""
        import pyarrow.parquet as pq

        hash_index_path = table.properties.get("earthcatalog.hash_index_path")
        if hash_index_path is None:
            hash_index_path = default_hash_index_path
        if not hash_index_path:
            return 0

        try:
            if hash_index_path.startswith("s3://"):
                if not store:
                    return 0
                _, _, rest = hash_index_path.partition("s3://")
                obstore_key = rest.split("/", 1)[1] if "/" in rest else ""
                if not obstore_key:
                    return 0
                return _parquet_row_count_from_store(store, obstore_key)

            if not Path(hash_index_path).exists():
                return 0
            return pq.ParquetFile(hash_index_path).metadata.num_rows

        except Exception:
            return 0

    def _h3_cells(self, geom) -> list[str]:
        from shapely import wkb

        from earthcatalog.grids.h3_partitioner import H3Partitioner

        res = self.grid_resolution if self.grid_resolution is not None else 1
        return H3Partitioner(resolution=res).get_intersecting_keys(wkb.dumps(geom))

    def _geojson_keys(self, geom) -> list[str]:
        if not self.boundaries_path:
            raise ValueError(
                "boundaries_path is required for geojson grid type. "
                "Re-ingest with a GridConfig that specifies boundaries_path."
            )
        from shapely import wkb

        from earthcatalog.grids.geojson_partitioner import GeoJSONPartitioner

        return GeoJSONPartitioner(
            boundaries_path=self.boundaries_path,
            id_field=self.id_field or "id",
        ).get_intersecting_keys(wkb.dumps(geom))

    def __repr__(self) -> str:
        if self.grid_type == "h3":
            return f"CatalogInfo(grid_type='h3', resolution={self.grid_resolution})"
        return (
            f"CatalogInfo(grid_type='geojson', "
            f"boundaries_path={self.boundaries_path!r}, id_field={self.id_field!r})"
        )

Functions¶

`cells_for_geometry(geom)` ¶

Return the partition keys that intersect geom.

Source code in earthcatalog/catalog.py

def cells_for_geometry(self, geom) -> list[str]:
    """Return the partition keys that intersect *geom*."""
    if self.grid_type == "h3":
        return self._h3_cells(geom)
    if self.grid_type == "geojson":
        return self._geojson_keys(geom)
    raise ValueError(f"Unknown grid type: {self.grid_type!r}")

`cell_list_sql(geom)` ¶

Return a SQL fragment suitable for WHERE grid_partition IN (...).

Source code in earthcatalog/catalog.py

def cell_list_sql(self, geom) -> str:
    """Return a SQL fragment suitable for ``WHERE grid_partition IN (...)``.

    When no partition intersects *geom* the fragment is
    ``grid_partition IN (NULL)``.  Single quotes inside a key are doubled so
    that the key stays a valid SQL string literal.
    """
    cells = self.cells_for_geometry(geom)
    if not cells:
        return "grid_partition IN (NULL)"
    quoted = ", ".join("'" + str(c).replace("'", "''") + "'" for c in cells)
    return f"grid_partition IN ({quoted})"

`file_paths(table, geom, start_datetime=None, end_datetime=None)` ¶

Return Parquet file paths for partitions intersecting geom.

Source code in earthcatalog/catalog.py

def file_paths(
    self,
    table,
    geom,
    start_datetime: str | datetime | None = None,
    end_datetime: str | datetime | None = None,
) -> list[str]:
    """Return Parquet file paths for partitions intersecting *geom*.

    Files are selected by planning an Iceberg scan whose row filter restricts
    ``grid_partition`` to the cells intersecting *geom* and, when bounds are
    given, ``datetime`` to the closed range ``[start_datetime, end_datetime]``.
    When a temporal bound is present the planned files are additionally
    pruned by their partition year.

    Args:
        table: Table object exposing ``scan(row_filter=...).plan_files()``.
        geom: Geometry forwarded to ``cells_for_geometry``.
        start_datetime: Optional inclusive lower bound, parsed with ``_parse_dt``.
        end_datetime: Optional inclusive upper bound, parsed with ``_parse_dt``.

    Returns:
        The ``file_path`` of every planned data file that survives pruning;
        an empty list when no partition cell intersects *geom*.
    """
    from pyiceberg.expressions import And, GreaterThanOrEqual, In, LessThanOrEqual

    cells = self.cells_for_geometry(geom)
    if not cells:
        return []

    has_start = start_datetime is not None
    has_end = end_datetime is not None
    # Parse each bound once; it feeds both the row filter and the year check.
    start = _parse_dt(start_datetime) if has_start else None
    end = _parse_dt(end_datetime) if has_end else None

    expr = In("grid_partition", cells)
    if has_start:
        expr = And(expr, GreaterThanOrEqual("datetime", start))
    if has_end:
        expr = And(expr, LessThanOrEqual("datetime", end))

    tasks = table.scan(row_filter=expr).plan_files()
    if not (has_start or has_end):
        # No temporal bound: the partition year is irrelevant, so it is not
        # read at all (it may legitimately be missing).
        return [task.file.file_path for task in tasks]

    start_year = start.year if has_start else None
    end_year = end.year if has_end else None

    paths = []
    for task in tasks:
        # Presumably the Iceberg year-transform value (years since 1970) —
        # NOTE(review): confirm against the table's partition spec.
        year_offset = task.file.partition[1]
        if year_offset is not None:
            year = year_offset + 1970
            if start_year is not None and year < start_year:
                continue
            if end_year is not None and year > end_year:
                continue
        # A file without a partition year cannot be pruned by year; keep it
        # rather than raise TypeError on ``None + 1970``.
        paths.append(task.file.file_path)

    return paths

`stats(table)` ¶

Per-partition row counts and file sizes from Iceberg manifests.

Source code in earthcatalog/catalog.py

def stats(self, table) -> list[dict]:
    """Return the per-partition statistics produced by ``_ensure_stats(table)``.

    The original summary describes these as per-partition row counts and file
    sizes taken from Iceberg manifests.
    """
    partition_stats = self._ensure_stats(table)
    return partition_stats

`top_cells(table, limit=5)` ¶

Top partitions by row count (cached alongside `stats`).

Source code in earthcatalog/catalog.py

def top_cells(self, table, limit: int = 5) -> list[dict]:
    """Return the first *limit* entries of the cached top-partition list.

    ``_ensure_stats(table)`` is called first; its return value is not used
    here (presumably it fills ``_cached_top_cells`` — verify in ``_ensure_stats``).
    """
    self._ensure_stats(table)
    ranked = self._cached_top_cells
    return ranked[slice(None, limit)]  # type: ignore[index]

`total_files(table)` ¶

Total Parquet file count from Iceberg snapshot manifests.

Source code in earthcatalog/catalog.py

def total_files(self, table) -> int:
    """Return the sum of ``file_count`` over all per-partition stats entries."""
    count = 0
    for entry in self._ensure_stats(table):
        count += entry["file_count"]
    return count

`unique_item_count(table, store, default_hash_index_path=None)` ¶

Row count of the hash-index Parquet file (footer read only).

Source code in earthcatalog/catalog.py

def unique_item_count(self, table, store, default_hash_index_path: str | None = None) -> int:
    """Return the row count of the hash-index Parquet file, or 0 if unavailable.

    The index location is read from the table property
    ``earthcatalog.hash_index_path``; *default_hash_index_path* is used only
    when that property is absent.  ``s3://`` locations are counted through
    ``_parquet_row_count_from_store`` using the object key (the part after the
    bucket name); any other location is treated as a local file whose row
    count is read from its Parquet metadata.

    This is best-effort: a missing path, missing store, missing file or any
    exception while reading yields ``0``.
    """
    import pyarrow.parquet as pq

    index_path = table.properties.get("earthcatalog.hash_index_path")
    if index_path is None:
        index_path = default_hash_index_path
    if not index_path:
        return 0

    scheme = "s3://"
    try:
        if not index_path.startswith(scheme):
            # Local file: only the Parquet metadata is consulted.
            if Path(index_path).exists():
                return pq.ParquetFile(index_path).metadata.num_rows
            return 0

        if not store:
            return 0
        remainder = index_path[len(scheme) :]
        # Drop the bucket name; what follows the first "/" is the object key.
        _bucket, _sep, object_key = remainder.partition("/")
        if object_key:
            return _parquet_row_count_from_store(store, object_key)
        return 0
    except Exception:
        return 0

`EarthCatalog` ¶

Simplified facade for querying an EarthCatalog.

Combines PyIceberg catalog, table, and CatalogInfo into a single interface for spatial/temporal queries with automatic file pruning.

Example::

from earthcatalog import open as ec_open
from obstore.store import S3Store
from shapely.geometry import Point

store = S3Store(bucket='my-bucket', region='us-west-2')
ec = ec_open(store=store, base='s3://my-bucket/catalog')

point = Point(-133.99, 58.74)
paths = ec.search_files(point, start_datetime='2020-01-01')

Source code in earthcatalog/catalog.py

class EarthCatalog:
    """Simplified facade for querying an EarthCatalog.

    Combines PyIceberg catalog, table, and CatalogInfo into a single interface
    for spatial/temporal queries with automatic file pruning.

    Example::

        from earthcatalog import open as ec_open
        from obstore.store import S3Store
        from shapely.geometry import Point

        store = S3Store(bucket='my-bucket', region='us-west-2')
        ec = ec_open(store=store, base='s3://my-bucket/catalog')

        point = Point(-133.99, 58.74)
        paths = ec.search_files(point, start_datetime='2020-01-01')
    """

    def __init__(
        self,
        catalog: object,
        table: Table,
        info: CatalogInfo,
        store: object | None = None,
        *,
        catalog_key: str | None = None,
    ):
        """Initialize an EarthCatalog facade.

        Args:
            catalog: PyIceberg SqlCatalog instance
            table: PyIceberg Table instance
            info: CatalogInfo with grid metadata
            store: obstore Store instance (for reading hash index from S3)
            catalog_key: Key within *store* where catalog.db is persisted.
                         Required for ingest() which needs to upload changes.
        """
        self._catalog = catalog
        self._table = table
        self._info = info
        self._store = store
        self._catalog_key = catalog_key

    def search_files(
        self,
        geom,
        start_datetime: str | datetime | None = None,
        end_datetime: str | datetime | None = None,
    ) -> list[str]:
        """Return Parquet file paths for partitions intersecting *geom*."""
        return self._info.file_paths(
            self._table,
            geom,
            start_datetime=start_datetime,
            end_datetime=end_datetime,
        )

    def search(self, **kwargs):
        """Build a deferred ``EarthCatalogItemSearch`` over the catalog.

        Accepts the same kwargs as :func:`rustac.search`:
        ``intersects``, ``bbox``, ``datetime``, ``filter`` (CQL2 JSON),
        ``ids``, ``collections``, ``max_items``, ``limit``, ``sortby``,
        ``include``, ``exclude``, ``query``, etc.  They are stored on the
        returned object as its ``params``.

        Use the top-level ``datetime`` kwarg for temporal filtering.  Do
        **not** reference ``datetime`` inside the CQL2 ``filter`` —
        rustac generates broken SQL when ``datetime`` appears in a CQL2
        expression.

        Performance
        -----------
        For fastest results use :meth:`duck_search`
        (DuckDB parallel I/O, ~2× faster across all query types).
        ``search()`` and ``search_to_arrow()`` use rustac (sequential per-file)
        and have comparable speed.  See :doc:`/operations/search_performance`
        for detailed benchmarks.

        Returns
        -------
        EarthCatalogItemSearch
            A lazy, pystac_client-compatible search result.  No I/O until
            ``items()``, ``item_collection()``, or ``pages()`` is called.
        """
        from .search import EarthCatalogItemSearch, _FileSearchEngine

        # File pruning is delegated to this catalog's Iceberg-backed helper.
        file_engine = _FileSearchEngine(prune_fn=self._search_prune)
        deferred = EarthCatalogItemSearch(
            params=kwargs,
            engine=file_engine,
            table=self._table,
            # Passed uncalled: the search object creates the context itself.
            anonymous_ctx=self._cleared_env_s3,
        )
        return deferred

    def search_to_arrow(self, **kwargs):
        """Run the search eagerly and return the result as a PyArrow table.

        The engine call runs inside the ``_cleared_env_s3()`` context; all
        kwargs are forwarded unchanged to the engine.
        """
        from .search import _FileSearchEngine

        file_engine = _FileSearchEngine(prune_fn=self._search_prune)
        env_guard = self._cleared_env_s3()
        with env_guard:
            result = file_engine.search_to_arrow(**kwargs)
        return result

    def search_uris(self, **kwargs):
        """Return asset URIs as a DataFrame with ``(id, uri)`` columns.

        Accepts the same kwargs as :meth:`search` (``intersects``, ``bbox``,
        ``datetime``, ``filter``, ``max_items``, etc.).  Of these, this method
        itself reads ``intersects``, ``bbox``, ``filter`` and ``max_items``;
        the temporal range is derived by ``_extract_datetime_range``.

        Uses Iceberg file pruning + DuckDB internally, reading **only** the
        ``id`` and ``assets`` columns from S3 — fastest way to get download
        URLs for thousands of items.  Returns a ``pandas.DataFrame``.

        The ``uri`` value is the ``href`` of the ``data`` asset; it is ``None``
        when the assets value is empty, is not valid JSON, or has no such entry.

        The DuckDB connection opened for the query is always closed before
        returning, including when the query raises.

        Examples::

            import cql2
            df = catalog.search_uris(
                intersects={"type": "Point", "coordinates": [-45, 70]},
                datetime="2020-01-01/2020-12-31",
                filter=cql2.parse_text("percent_valid_pixels >= 80").to_json(),
                max_items=100,
            )
            # df has columns: id, uri
            for _, row in df.iterrows():
                print(row.id, row.uri)
        """
        import json

        import duckdb
        import pandas as pd
        from shapely.geometry import shape

        from .search import _extract_datetime_range

        # --- geometry ---
        geom = None
        if "intersects" in kwargs:
            geom = shape(kwargs["intersects"])
        elif "bbox" in kwargs:
            from shapely.geometry import box

            b = kwargs["bbox"]
            geom = box(b[0], b[1], b[2], b[3])

        # --- Iceberg pruning ---
        start_dt, end_dt = _extract_datetime_range(**kwargs)
        paths = self._info.file_paths(
            self._table,
            geom,
            start_datetime=start_dt,
            end_datetime=end_dt,
        )
        if not paths:
            return pd.DataFrame({"id": [], "uri": []})

        # --- build SQL (read only id + assets) ---
        path_list = ", ".join(repr(p) for p in paths)
        conditions: list[str] = []
        if geom is not None:
            conditions.append(f"ST_Intersects(geometry, ST_GeomFromText('{geom.wkt}'))")
        if start_dt is not None:
            conditions.append(f"datetime >= '{start_dt}'")
        if end_dt is not None:
            conditions.append(f"datetime <= '{end_dt}'")
        raw_filter = kwargs.get("filter")
        if raw_filter is not None:
            from .search import _cql2_to_sql

            conditions.append(_cql2_to_sql(raw_filter))
        where = " AND ".join(conditions) if conditions else "TRUE"
        max_items = kwargs.get("max_items")

        sql = f"""SELECT id, assets FROM read_parquet([{path_list}]) WHERE {where}"""

        # --- execute (Arrow → list is faster than pandas iterrows) ---
        con = duckdb.connect()
        try:
            con.execute("INSTALL spatial; LOAD spatial;")
            # Blank S3 credentials on this connection before reading.
            con.execute("SET s3_access_key_id='';")
            con.execute("SET s3_secret_access_key='';")
            con.execute("SET s3_session_token='';")
            table = con.execute(sql).to_arrow_table()
        finally:
            # The Arrow table is fully materialised, so the connection can go.
            con.close()
        if max_items is not None and table.num_rows > max_items:
            # Truncation happens client-side, after the full result is fetched.
            table = table.slice(0, max_items)

        # --- extract data URIs from JSON assets ---
        ids = table.column("id").to_pylist()
        assets_list = table.column("assets").to_pylist()
        uris = []
        for a in assets_list:
            href = None
            if a:
                try:
                    href = json.loads(a).get("data", {}).get("href")
                except (json.JSONDecodeError, AttributeError):
                    # Unparseable or unexpectedly shaped assets: leave uri as None.
                    pass
            uris.append(href)

        return pd.DataFrame({"id": ids, "uri": uris})

    def duck_search(self, **kwargs):
        """Search using DuckDB, returning results as a ``pandas.DataFrame``.

        Accepts the same kwargs as :meth:`search` (``intersects``, ``bbox``,
        ``datetime``, ``filter``, ``max_items``, etc.).  Of these, this method
        itself reads ``intersects``, ``bbox``, ``filter`` and ``max_items``;
        the temporal range is derived by ``_extract_datetime_range``.

        DuckDB reads Parquet files in parallel internally, making this
        **~2× faster** than :meth:`search` across all query types.
        Returns a DataFrame with flat columns — no pystac conversion
        overhead.  For pystac Items use :meth:`search` (lazy iteration).

        An empty ``DataFrame`` (no columns) is returned when file pruning
        selects no files.  The DuckDB connection opened for the query is
        always closed before returning, including when the query raises.

        Examples::

            df = catalog.duck_search(
                intersects={"type": "Point", "coordinates": [-45, 70]},
                datetime="1980-01-01/2015-12-31",
                max_items=100,
            )
            # df is a pandas.DataFrame
            print(df.columns.tolist())
        """
        import duckdb
        from shapely.geometry import shape

        from .search import _cql2_to_sql, _extract_datetime_range

        geom = None
        if "intersects" in kwargs:
            geom = shape(kwargs["intersects"])
        elif "bbox" in kwargs:
            from shapely.geometry import box

            b = kwargs["bbox"]
            geom = box(b[0], b[1], b[2], b[3])

        start_dt, end_dt = _extract_datetime_range(**kwargs)
        paths = self._info.file_paths(
            self._table, geom, start_datetime=start_dt, end_datetime=end_dt
        )
        if not paths:
            import pandas as pd

            return pd.DataFrame()

        path_list = ", ".join(repr(p) for p in paths)
        conditions: list[str] = []
        if geom is not None:
            conditions.append(f"ST_Intersects(geometry, ST_GeomFromText('{geom.wkt}'))")
        if start_dt is not None:
            conditions.append(f"datetime >= '{start_dt}'")
        if end_dt is not None:
            conditions.append(f"datetime <= '{end_dt}'")
        raw_filter = kwargs.get("filter")
        if raw_filter is not None:
            conditions.append(_cql2_to_sql(raw_filter))

        where = " AND ".join(conditions) if conditions else "TRUE"
        max_items = kwargs.get("max_items")
        # LIMIT omitted — triggers 7× slower plan for multi-file reads
        sql = f"SELECT * FROM read_parquet([{path_list}]) WHERE {where}"

        con = duckdb.connect()
        try:
            con.execute("INSTALL spatial; LOAD spatial;")
            # Blank S3 credentials on this connection before reading.
            con.execute("SET s3_access_key_id='';")
            con.execute("SET s3_secret_access_key='';")
            con.execute("SET s3_session_token='';")
            df = con.execute(sql).fetchdf()
        finally:
            # The DataFrame is fully materialised, so the connection can go.
            con.close()
        if max_items is not None and len(df) > max_items:
            # Truncation happens client-side, after the full result is fetched.
            df = df.head(max_items)
        return df

    def _search_prune(self, geom, start_datetime=None, end_datetime=None):
        """Prune warehouse files via Iceberg partition metadata (zero I/O)."""
        return self._info.file_paths(
            self._table, geom, start_datetime=start_datetime, end_datetime=end_datetime
        )

    def _cleared_env_s3(self):
        """Context manager: clear AWS cred env vars so rustac/DuckDB use unsigned requests.

        rustac and DuckDB read ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` from the
        environment rather than using the obstore store's auth.  When the store was created
        as anonymous (``skip_signature``) or the environment has no credentials, this
        context manager temporarily removes them and sets ``AWS_NO_SIGN_REQUEST=yes``.
        """
        import os
        from contextlib import contextmanager

        anonymous = not os.environ.get("AWS_ACCESS_KEY_ID")
        if not anonymous and self._store is not None and hasattr(self._store, "config"):
            anonymous = self._store.config.get("skip_signature") in (True, "true")

        @contextmanager
        def _ctx():
            if not anonymous:
                yield
                return
            saved = {
                "AWS_ACCESS_KEY_ID": os.environ.pop("AWS_ACCESS_KEY_ID", None),
                "AWS_SECRET_ACCESS_KEY": os.environ.pop("AWS_SECRET_ACCESS_KEY", None),
                "AWS_SESSION_TOKEN": os.environ.pop("AWS_SESSION_TOKEN", None),
            }
            os.environ["AWS_NO_SIGN_REQUEST"] = "yes"
            try:
                yield
            finally:
                os.environ.pop("AWS_NO_SIGN_REQUEST", None)
                for k, v in saved.items():
                    if v is not None:
                        os.environ[k] = v

        return _ctx()

    def stats(self) -> list[dict]:
        """Return per-partition row counts and file sizes from Iceberg metadata."""
        return self._info.stats(self._table)

    def unique_item_count(self) -> int:
        """Return the count of unique STAC items from the hash index."""
        default_hash_index_path = None
        if self._catalog is not None:
            warehouse = self._catalog.properties.get("warehouse", "")
            if warehouse:
                default_hash_index_path = warehouse.rstrip("/") + "_id_hashes.parquet"

        return self._info.unique_item_count(self._table, self._store, default_hash_index_path)

    def info(self) -> CatalogInfo:
        """Return the grid metadata and catalog statistics object."""
        return self._info

    def ingest(
        self,
        inventory_path: str,
        *,
        mode: str = "auto",
        chunk_size: int = 10000,
        limit: int | None = None,
        since: datetime | None = None,
        update_hash_index: bool = False,
    ) -> dict:
        """Ingest STAC items from an S3 Inventory into the catalog.

        Unified entry point replacing both ``backfill.run_backfill`` and
        ``incremental.run``.  Handles full backfill (drop+recreate table)
        and delta append (add files to existing table).

        The caller is responsible for holding an S3Lock around this call
        when running against a shared store (use ``self.lock()``).
        """
        import os
        import uuid
        from concurrent.futures import ThreadPoolExecutor

        from earthcatalog.grids import build_partitioner
        from earthcatalog.pipelines.incremental import _fetch_item, _iter_inventory

        from .hash_index import (
            merge_hashes_from_parquets,
            read_hashes,
            write_hashes,
        )
        from .transform import (
            fan_out,
            group_by_partition,
            write_geoparquet_s3,
        )

        if not os.environ.get("AWS_ACCESS_KEY_ID"):
            raise RuntimeError(
                "No AWS credentials found in environment. "
                "ingest() requires write access to S3. "
                "Set AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY or use an IAM role."
            )

        if mode == "auto":
            try:
                n = sum(s["row_count"] for s in self._info.stats(self._table))
                mode = "delta" if n > 0 else "full"
            except Exception:
                mode = "full"

        is_delta = mode == "delta"

        from earthcatalog.config import GridConfig

        grid_cfg = GridConfig(
            type=self._info.grid_type,
            resolution=self._info.grid_resolution,
            boundaries_path=self._info.boundaries_path,
            id_field=self._info.id_field,
        )
        partitioner = build_partitioner(grid_cfg)

        warehouse_root = self._catalog.properties.get("warehouse", "")
        uri = self._catalog.properties.get("uri", "")
        local_db = uri.removeprefix("sqlite:///") if uri else "/tmp/earthcatalog.db"

        if self._store and self._catalog_key:
            self.download_catalog(local_db)

        if not is_delta:
            from pyiceberg.exceptions import NoSuchTableError

            try:
                self._catalog.drop_table(FULL_NAME)
            except NoSuchTableError:
                pass
            try:
                self._catalog.create_namespace(NAMESPACE)
            except Exception:
                pass
            self._table = get_or_create(self._catalog, grid_config=grid_cfg)

        total_items = 0
        total_rows = 0
        written_keys: list[str] = []
        batch: list[tuple[str, str]] = []

        def _flush(chunk: list[tuple[str, str]]) -> None:
            nonlocal total_rows

            with ThreadPoolExecutor(max_workers=16) as pool:
                items = list(filter(None, pool.map(lambda bc: _fetch_item(*bc), chunk)))

            if not items:
                return

            fo = fan_out(items, partitioner)
            if not fo:
                return

            for (cell, year), group_items in group_by_partition(fo).items():
                year_str = str(year) if year is not None else "unknown"
                part_tag = uuid.uuid4().hex[:8]
                s3_key = f"grid_partition={cell}/year={year_str}/part_{part_tag}.parquet"
                n, _ = write_geoparquet_s3(group_items, self._store, s3_key)
                if n > 0:
                    written_keys.append(s3_key)
                    total_rows += n

        print(f"Ingesting from: {inventory_path}")
        for bucket, key in _iter_inventory(inventory_path, since=since):
            if not key.endswith(".stac.json"):
                continue
            batch.append((bucket, key))
            total_items += 1
            if len(batch) >= chunk_size:
                _flush(batch)
                batch.clear()
            if limit and total_items >= limit:
                break

        if batch:
            _flush(batch)

        if written_keys:
            full_paths = [f"{warehouse_root.rstrip('/')}/{k}" for k in written_keys]
            batch_sz = 2000
            for i in range(0, len(full_paths), batch_sz):
                self._table.add_files(full_paths[i : i + batch_sz])
            print(f"Registered {len(full_paths)} files in Iceberg catalog.")

        if update_hash_index and written_keys:
            hash_index_path = self._table.properties.get("earthcatalog.hash_index_path")
            if not hash_index_path:
                hash_index_path = f"{warehouse_root.rstrip('/')}_id_hashes.parquet"
                with self._table.transaction() as tx:
                    tx.set_properties(**{"earthcatalog.hash_index_path": hash_index_path})

            if hash_index_path.startswith("s3://"):
                import re as _re

                m = _re.match(r"s3://([^/]+)/(.+)", hash_index_path)
                if m:
                    hash_key = m.group(2)
                    existing = read_hashes(self._store, hash_key)
                    print(f"  Existing hashes: {len(existing):,}")
                    updated, n_new = merge_hashes_from_parquets(
                        full_paths, existing, store=self._store
                    )
                    print(f"  New hashes: {n_new:,} from {len(full_paths)} files")
                    write_hashes(updated, self._store, hash_key)
            else:
                print("WARN: hash index update skipped — only s3:// paths supported")

        if self._store and self._catalog_key:
            self.upload_catalog(local_db)

        result = {
            "items_processed": total_items,
            "rows_written": total_rows,
            "files_registered": len(written_keys),
        }
        print(f"Done. {total_items} items -> {total_rows} rows in {len(written_keys)} files")
        return result

    def bulk_ingest(
        self,
        inventory_path: str,
        *,
        mode: str = "auto",
        chunk_size: int = 100_000,
        compact_rows: int = 100_000,
        limit: int | None = None,
        since: datetime | None = None,
        update_hash_index: bool = False,
        staging_prefix: str | None = None,
        create_client: Callable[[], object] | None = None,
        skip_inventory: bool = False,
        skip_ingest: bool = False,
        retry_pending: bool = False,
    ) -> None:
        """Ingest large inventories using a distributed Dask cluster."""
        import os
        from datetime import UTC
        from datetime import datetime as _dt

        from earthcatalog.config import GridConfig
        from earthcatalog.grids import build_partitioner
        from earthcatalog.pipelines.backfill import run_backfill

        if not os.environ.get("AWS_ACCESS_KEY_ID"):
            raise RuntimeError(
                "No AWS credentials found in environment. "
                "bulk_ingest() requires write access to S3. "
                "Set AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY or use an IAM role."
            )

        warehouse_root = self._catalog.properties.get("warehouse", "")
        uri = self._catalog.properties.get("uri", "")
        local_db = uri.removeprefix("sqlite:///")

        grid_cfg = GridConfig(
            type=self._info.grid_type,
            resolution=self._info.grid_resolution,
            boundaries_path=self._info.boundaries_path,
            id_field=self._info.id_field,
        )
        partitioner = build_partitioner(grid_cfg)

        if staging_prefix is None:
            date_str = _dt.now(UTC).strftime("%Y%m%d")
            staging_prefix = f"bulk_ingest/{date_str}"

        delta = True
        if mode == "full":
            delta = False
        elif mode == "auto":
            try:
                n = sum(s["row_count"] for s in self._info.stats(self._table))
                delta = n > 0
            except Exception:
                delta = False

        if self._store and self._catalog_key:
            self.download_catalog(local_db)

        from . import store_config

        old_store = store_config.get_store()
        old_key = store_config.get_catalog_key()
        try:
            store_config.set_store(self._store)
            if self._catalog_key:
                store_config.set_catalog_key(self._catalog_key)

            run_backfill(
                inventory_path=inventory_path,
                catalog_path=local_db,
                staging_store=self._store,
                staging_prefix=staging_prefix,
                warehouse_store=self._store,
                warehouse_root=warehouse_root,
                partitioner=partitioner,
                chunk_size=chunk_size,
                compact_rows=compact_rows,
                limit=limit,
                since=since,
                use_lock=False,
                upload=True,
                skip_inventory=skip_inventory,
                skip_ingest=skip_ingest,
                retry_pending=retry_pending,
                delta=delta,
                create_client=create_client,
                update_hash_index=update_hash_index,
                hash_index_path=self._table.properties.get("earthcatalog.hash_index_path"),
            )
        finally:
            store_config.set_store(old_store)
            store_config.set_catalog_key(old_key)

    def download_catalog(self, local_path: str) -> None:
        """Fetch catalog.db from the backing store into *local_path*.

        Calls the module-level ``download_catalog`` with this catalog's store.
        """
        backing_store = self._store
        download_catalog(local_path, store=backing_store)

    def upload_catalog(self, local_path: str) -> None:
        """Push catalog.db from *local_path* to the backing store.

        Calls the module-level ``upload_catalog`` with this catalog's store.
        """
        backing_store = self._store
        upload_catalog(local_path, store=backing_store)

    def compact(
        self,
        threshold: int = 2,
        dry_run: bool = False,
    ) -> dict[str, int]:
        """Compact over-threshold partition buckets and rebuild the Iceberg catalog.

        Thin wrapper around
        :func:`earthcatalog.maintenance.compact.compact_warehouse`.  The
        warehouse path is the catalog's ``warehouse`` property and the catalog
        database path is its ``uri`` property with the ``sqlite:///`` prefix
        removed.

        Parameters
        ----------
        threshold:
            Minimum number of part files in a bucket before it is compacted.
            Default: 2 (compact any bucket with more than one part file).
        dry_run:
            When ``True``, report what *would* be compacted but make no changes.

        Returns
        -------
        Summary dict with keys ``buckets_scanned``, ``buckets_compacted``,
        ``files_before``, ``files_after``.
        """
        from earthcatalog.maintenance.compact import compact_warehouse

        props = self._catalog.properties
        settings = {
            "warehouse_path": props.get("warehouse", ""),
            "catalog_path": props.get("uri", "").removeprefix("sqlite:///"),
            "threshold": threshold,
            "dry_run": dry_run,
        }
        return compact_warehouse(**settings)

    def lock(self, owner: str, ttl_hours: int = 12):
        """Build an ``S3Lock`` bound to this catalog's store.

        The lock key is the catalog object's ``_lock_key`` attribute when that
        is present and truthy, otherwise ``".lock"``.
        """
        from .lock import S3Lock

        configured_key = getattr(self._catalog, "_lock_key", None)
        key = configured_key if configured_key else ".lock"
        return S3Lock(owner=owner, ttl_hours=ttl_hours, store=self._store, key=key)

    def cells_for_geometry(self, geom) -> list[str]:
        """Return the partition keys that intersect *geom*."""
        return self._info.cells_for_geometry(geom)

    def cell_list_sql(self, geom) -> str:
        """Return a SQL fragment suitable for ``WHERE grid_partition IN (...)``."""
        return self._info.cell_list_sql(geom)

    @property
    def grid_type(self) -> str:
        """Return the grid partitioning system type."""
        return self._info.grid_type

    @property
    def grid_resolution(self) -> int | None:
        """Return the H3/S2 resolution (None for GeoJSON grids)."""
        return self._info.grid_resolution

    @property
    def table(self):
        """Return the underlying PyIceberg Table (for advanced use)."""
        return self._table

    def _repr_html_(self) -> str:
        """Return an HTML representation for Jupyter notebooks.

        Single-column layout with metadata table and collapsible top partitions.
        Reads only Iceberg manifests — no Parquet data is scanned.

        The output has up to three parts:

        * a metadata table (grid type, H3 resolution or boundaries path,
          warehouse path when set, hash-index availability);
        * a "Statistics" table (total files, total rows, unique items,
          partition count), emitted only when ``CatalogInfo.stats`` returns a
          non-empty list;
        * a collapsible "Top partitions" list with at most three entries,
          emitted only when ``CatalogInfo.top_cells`` returns entries.

        Values are interpolated into the markup as-is (no HTML escaping).
        """
        # --- metadata rows: (label, value) pairs rendered in the first table ---
        rows = [("Grid type", self._info.grid_type)]

        if self._info.grid_type == "h3":
            rows.append(("H3 resolution", str(self._info.grid_resolution)))
        else:
            rows.append(("Boundaries", self._info.boundaries_path or "N/A"))

        # The catalog object may be absent; then no warehouse row is shown.
        warehouse_path = self._catalog.properties.get("warehouse", "") if self._catalog else ""
        if warehouse_path:
            rows.append(("Warehouse", warehouse_path))

        # Only the presence of the table property is reported, not the path itself.
        hash_idx = self._table.properties.get("earthcatalog.hash_index_path")
        rows.append(("Hash index", "Available" if hash_idx else "Not available"))

        table_html = "<table style='border-collapse: collapse; width: 100%; margin: 0;'>"
        for label, value in rows:
            table_html += f"""
                <tr style='border-bottom: 1px solid currentColor;'>
                    <td style='padding: 6px 10px; border: none; width: 180px;'>{label}</td>
                    <td style='padding: 6px 10px; border: none;'><strong>{value}</strong></td>
                </tr>"""
        table_html += "</table>"

        # --- statistics section: skipped entirely when there are no stats ---
        stats = self._info.stats(self._table)
        bottom_html = ""
        if stats:
            total_files = self._info.total_files(self._table)
            total_rows = sum(s["row_count"] for s in stats)
            # Same fallback hash-index location as unique_item_count():
            # "<warehouse>_id_hashes.parquet", or None without a warehouse.
            warehouse = self._catalog.properties.get("warehouse", "") if self._catalog else ""
            default_hi = warehouse.rstrip("/") + "_id_hashes.parquet" if warehouse else None
            unique = self._info.unique_item_count(self._table, self._store, default_hi)

            stat_rows = [
                ("Total files", f"{total_files:,}"),
                ("Total rows", f"{total_rows:,}"),
                ("Unique items", f"{unique:,}"),
                ("Partitions", f"{len(stats):,}"),
            ]
            stats_table = "<table style='border-collapse: collapse; width: 100%; font-size: 13px; margin: 0;'>"
            for label, value in stat_rows:
                stats_table += f"""
                    <tr style='border-bottom: 1px solid currentColor;'>
                        <td style='padding: 4px 6px; border: none; width: 180px;'>{label}</td>
                        <td style='padding: 4px 6px; border: none;'><strong>{value}</strong></td>
                    </tr>"""
            stats_table += "</table>"

            # --- collapsible list of the three largest partitions ---
            top_cells = self._info.top_cells(self._table, limit=3)
            top_html = ""
            if top_cells:
                top_rows = ""
                for cell in top_cells:
                    # The partition key is cut to 12 characters and always
                    # followed by "...", even when nothing was cut off.
                    top_rows += f"""
                        <tr style='border-bottom: 1px solid currentColor;'>
                            <td style='padding: 4px 6px; border: none; width: 180px; font-family: monospace;'>{cell["grid_partition"][:12]}...</td>
                            <td style='padding: 4px 6px; border: none;'>{cell["row_count"]:,} rows</td>
                        </tr>"""
                top_html = f"""
                <details style='margin-top: 12px;'>
                    <summary style='font-weight: 600; cursor: pointer;'>Top partitions</summary>
                    <table style='border-collapse: collapse; width: 100%; font-size: 13px; margin: 8px 0 0 0;'>{top_rows}</table>
                </details>"""

            bottom_html = f"""
            <div style='font-weight: 600; margin-top: 12px;'>Statistics</div>
            {stats_table}
            {top_html}"""

        # --- outer card: header with globe icon, then the sections built above ---
        return f"""
        <div         style='border: 1px solid currentColor; padding: 15px; border-radius: 5px; font-family: var(--jp-code-font-family, monospace); opacity: 0.9; text-align: left; max-width: 800px;'>
            <div style='font-size: 16px; font-weight: 600; margin-bottom: 12px; display: flex; align-items: center; gap: 8px;'>
                <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20" height="20" fill="none" stroke="currentColor" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round">
                    <circle cx="12" cy="12" r="10"/>
                    <path d="M2 12h20M12 2c-3.3 2-5.5 5.5-5.5 10s2.2 8 5.5 10c3.3-2 5.5-5.5 5.5-10S15.3 4 12 2z"/>
                </svg>
                <span>EarthCatalog</span>
            </div>
            {table_html}
            {bottom_html}
        </div>"""

    def __repr__(self) -> str:
        if self._info.grid_type == "h3":
            return f"EarthCatalog(grid_type='h3', resolution={self._info.grid_resolution})"
        return (
            f"EarthCatalog(grid_type='geojson', "
            f"boundaries_path={self._info.boundaries_path!r}, "
            f"id_field={self._info.id_field!r})"
        )

Attributes¶

`grid_type` `property` ¶

Return the grid partitioning system type.

`grid_resolution` `property` ¶

Return the H3/S2 resolution (None for GeoJSON grids).

`table` `property` ¶

Return the underlying PyIceberg Table (for advanced use).

Functions¶

`search_files(geom, start_datetime=None, end_datetime=None)` ¶

Return Parquet file paths for partitions intersecting geom.

Source code in earthcatalog/catalog.py

def search_files(
    self,
    geom,
    start_datetime: str | datetime | None = None,
    end_datetime: str | datetime | None = None,
) -> list[str]:
    """List the Parquet files whose partitions intersect *geom*.

    The lookup is delegated to ``CatalogInfo.file_paths`` together with the
    underlying Iceberg table; the optional datetime bounds are forwarded
    unchanged.
    """
    time_window = {"start_datetime": start_datetime, "end_datetime": end_datetime}
    return self._info.file_paths(self._table, geom, **time_window)

`search(**kwargs)` ¶

Search across the catalog, returning a deferred EarthCatalogItemSearch.

Accepts the same kwargs as :func:rustac.search: intersects, bbox, datetime, filter (CQL2 JSON), ids, collections, max_items, limit, sortby, include, exclude, query, etc.

Use the top-level datetime kwarg for temporal filtering. Do not reference datetime inside the CQL2 filter — rustac generates broken SQL when datetime appears in a CQL2 expression.

Performance¶

For fastest results use :meth:duck_search with format="native" (DuckDB parallel I/O, ~2× faster across all query types). search() and search_to_arrow() use rustac (sequential per-file) and have comparable speed. See :doc:/operations/search_performance for detailed benchmarks.

Returns¶

EarthCatalogItemSearch A lazy, pystac_client-compatible search result. No I/O until items(), item_collection(), or pages() is called.

Source code in earthcatalog/catalog.py

def search(self, **kwargs):
    """Build a deferred ``EarthCatalogItemSearch`` over the catalog.

    The keyword arguments are stored untouched as the search parameters;
    they are the ones understood by :func:`rustac.search`
    (``intersects``, ``bbox``, ``datetime``, ``filter`` as CQL2 JSON,
    ``ids``, ``collections``, ``max_items``, ``limit``, ``sortby``,
    ``include``, ``exclude``, ``query``, ...).

    Filter on time with the top-level ``datetime`` kwarg.  Do **not**
    reference ``datetime`` inside the CQL2 ``filter`` — rustac generates
    broken SQL when ``datetime`` appears in a CQL2 expression.

    Performance
    -----------
    :meth:`duck_search` reads Parquet files in parallel through DuckDB and
    is reported to be ~2× faster.  ``search()`` and ``search_to_arrow()``
    go through rustac (sequential per-file) and have comparable speed.
    See :doc:`/operations/search_performance` for detailed benchmarks.

    Returns
    -------
    EarthCatalogItemSearch
        A lazy, pystac_client-compatible search result.  No I/O happens
        until ``items()``, ``item_collection()``, or ``pages()`` is called.
    """
    from .search import EarthCatalogItemSearch, _FileSearchEngine

    return EarthCatalogItemSearch(
        params=kwargs,
        # The engine prunes candidate files through this catalog's own hook.
        engine=_FileSearchEngine(prune_fn=self._search_prune),
        table=self._table,
        anonymous_ctx=self._cleared_env_s3,
    )

`search_to_arrow(**kwargs)` ¶

Search across the catalog, returning a PyArrow table.

Source code in earthcatalog/catalog.py

def search_to_arrow(self, **kwargs):
    """Run a search eagerly and return the matches as a PyArrow table.

    The keyword arguments are forwarded verbatim to the file search engine,
    which is executed inside the ``_cleared_env_s3`` context manager.
    """
    from .search import _FileSearchEngine

    file_engine = _FileSearchEngine(prune_fn=self._search_prune)
    with self._cleared_env_s3():
        arrow_table = file_engine.search_to_arrow(**kwargs)
    return arrow_table

`search_uris(**kwargs)` ¶

Return asset URIs as a DataFrame with (id, uri) columns.

Accepts the same kwargs as :meth:search (intersects, bbox, datetime, filter, max_items, etc.).

Uses search_files() + DuckDB internally, reading only the id and assets columns from S3 — fastest way to get download URLs for thousands of items. Returns a pandas.DataFrame.

Examples::

import cql2
df = catalog.search_uris(
    intersects={"type": "Point", "coordinates": [-45, 70]},
    datetime="2020-01-01/2020-12-31",
    filter=cql2.parse_text("percent_valid_pixels >= 80").to_json(),
    max_items=100,
)
# df has columns: id, uri
for _, row in df.iterrows():
    print(row.id, row.uri)

Source code in earthcatalog/catalog.py

def search_uris(self, **kwargs):
    """Return asset URIs as a DataFrame with ``(id, uri)`` columns.

    Accepts the same kwargs as :meth:`search` (``intersects``, ``bbox``,
    ``datetime``, ``filter``, ``max_items``, etc.).

    Candidate files are pruned through the Iceberg metadata first, then
    DuckDB reads **only** the ``id`` and ``assets`` columns — fastest way
    to get download URLs for thousands of items.  Returns a
    ``pandas.DataFrame``; ``uri`` is ``None`` for rows whose ``assets``
    JSON cannot be parsed or has no ``data.href`` entry.

    ``intersects`` takes precedence over ``bbox``.  A kwarg that is present
    but ``None`` is treated as absent, and a 6-element (3D) ``bbox`` is
    reduced to its 2D footprint.

    Examples::

        import cql2
        df = catalog.search_uris(
            intersects={"type": "Point", "coordinates": [-45, 70]},
            datetime="2020-01-01/2020-12-31",
            filter=cql2.parse_text("percent_valid_pixels >= 80").to_json(),
            max_items=100,
        )
        # df has columns: id, uri
        for _, row in df.iterrows():
            print(row.id, row.uri)
    """
    import json

    import duckdb
    from shapely.geometry import shape

    from .search import _extract_datetime_range

    # --- geometry ---
    geom = None
    intersects = kwargs.get("intersects")
    bbox = kwargs.get("bbox")
    if intersects is not None:
        geom = shape(intersects)
    elif bbox is not None:
        from shapely.geometry import box

        if len(bbox) == 6:
            # 3D bbox is [minx, miny, minz, maxx, maxy, maxz]; drop the z range.
            geom = box(bbox[0], bbox[1], bbox[3], bbox[4])
        else:
            geom = box(bbox[0], bbox[1], bbox[2], bbox[3])

    # --- Iceberg pruning ---
    start_dt, end_dt = _extract_datetime_range(**kwargs)
    paths = self._info.file_paths(
        self._table,
        geom,
        start_datetime=start_dt,
        end_datetime=end_dt,
    )
    if not paths:
        import pandas as pd

        return pd.DataFrame({"id": [], "uri": []})

    # --- build SQL (read only id + assets) ---
    path_list = ", ".join(repr(p) for p in paths)
    conditions: list[str] = []
    if geom is not None:
        conditions.append(f"ST_Intersects(geometry, ST_GeomFromText('{geom.wkt}'))")
    if start_dt is not None:
        conditions.append(f"datetime >= '{start_dt}'")
    if end_dt is not None:
        conditions.append(f"datetime <= '{end_dt}'")
    raw_filter = kwargs.get("filter")
    if raw_filter is not None:
        from .search import _cql2_to_sql

        conditions.append(_cql2_to_sql(raw_filter))
    where = " AND ".join(conditions) if conditions else "TRUE"
    max_items = kwargs.get("max_items")

    sql = f"""SELECT id, assets FROM read_parquet([{path_list}]) WHERE {where}"""

    # --- execute (Arrow → list is faster than pandas iterrows) ---
    con = duckdb.connect()
    try:
        con.execute("INSTALL spatial; LOAD spatial;")
        # Blank credentials so the Parquet files are read without signing.
        con.execute("SET s3_access_key_id='';")
        con.execute("SET s3_secret_access_key='';")
        con.execute("SET s3_session_token='';")
        table = con.execute(sql).to_arrow_table()
    finally:
        # Always release the connection, even when the query fails.
        con.close()
    if max_items is not None and table.num_rows > max_items:
        table = table.slice(0, max_items)

    # --- extract data URIs from JSON assets ---
    ids = table.column("id").to_pylist()
    assets_list = table.column("assets").to_pylist()
    uris = []
    for a in assets_list:
        href = None
        if a:
            try:
                href = json.loads(a).get("data", {}).get("href")
            except (json.JSONDecodeError, AttributeError):
                # Malformed or unexpectedly shaped assets: leave the URI empty.
                pass
        uris.append(href)

    import pandas as pd

    return pd.DataFrame({"id": ids, "uri": uris})

`duck_search(**kwargs)` ¶

Search using DuckDB, returning results as a pandas.DataFrame.

Accepts the same kwargs as :meth:search (intersects, bbox, datetime, filter, max_items, etc.).

DuckDB reads Parquet files in parallel internally, making this ~2× faster than :meth:search across all query types. Returns a DataFrame with flat columns — no pystac conversion overhead. For pystac Items use :meth:search (lazy iteration).

Examples::

df = catalog.duck_search(
    intersects={"type": "Point", "coordinates": [-45, 70]},
    datetime="1980-01-01/2015-12-31",
    max_items=100,
)
# df is a pandas.DataFrame
print(df.columns.tolist())

Source code in earthcatalog/catalog.py

def duck_search(self, **kwargs):
    """Search using DuckDB, returning results as a ``pandas.DataFrame``.

    Accepts the same kwargs as :meth:`search` (``intersects``, ``bbox``,
    ``datetime``, ``filter``, ``max_items``, etc.).

    DuckDB reads Parquet files in parallel internally, making this
    **~2× faster** than :meth:`search` across all query types.
    Returns a DataFrame with flat columns — no pystac conversion
    overhead.  For pystac Items use :meth:`search` (lazy iteration).
    An empty DataFrame is returned when no file survives pruning.

    ``intersects`` takes precedence over ``bbox``.  A kwarg that is present
    but ``None`` is treated as absent, and a 6-element (3D) ``bbox`` is
    reduced to its 2D footprint.

    Examples::

        df = catalog.duck_search(
            intersects={"type": "Point", "coordinates": [-45, 70]},
            datetime="1980-01-01/2015-12-31",
            max_items=100,
        )
        # df is a pandas.DataFrame
        print(df.columns.tolist())
    """
    import duckdb
    from shapely.geometry import shape

    from .search import _cql2_to_sql, _extract_datetime_range

    geom = None
    intersects = kwargs.get("intersects")
    bbox = kwargs.get("bbox")
    if intersects is not None:
        geom = shape(intersects)
    elif bbox is not None:
        from shapely.geometry import box

        if len(bbox) == 6:
            # 3D bbox is [minx, miny, minz, maxx, maxy, maxz]; drop the z range.
            geom = box(bbox[0], bbox[1], bbox[3], bbox[4])
        else:
            geom = box(bbox[0], bbox[1], bbox[2], bbox[3])

    # Prune candidate files through the Iceberg metadata before touching data.
    start_dt, end_dt = _extract_datetime_range(**kwargs)
    paths = self._info.file_paths(
        self._table, geom, start_datetime=start_dt, end_datetime=end_dt
    )
    if not paths:
        import pandas as pd

        return pd.DataFrame()

    path_list = ", ".join(repr(p) for p in paths)
    conditions: list[str] = []
    if geom is not None:
        conditions.append(f"ST_Intersects(geometry, ST_GeomFromText('{geom.wkt}'))")
    if start_dt is not None:
        conditions.append(f"datetime >= '{start_dt}'")
    if end_dt is not None:
        conditions.append(f"datetime <= '{end_dt}'")
    raw_filter = kwargs.get("filter")
    if raw_filter is not None:
        conditions.append(_cql2_to_sql(raw_filter))

    where = " AND ".join(conditions) if conditions else "TRUE"
    max_items = kwargs.get("max_items")
    # LIMIT omitted — triggers 7× slower plan for multi-file reads
    sql = f"SELECT * FROM read_parquet([{path_list}]) WHERE {where}"

    con = duckdb.connect()
    try:
        con.execute("INSTALL spatial; LOAD spatial;")
        # Blank credentials so the Parquet files are read without signing.
        con.execute("SET s3_access_key_id='';")
        con.execute("SET s3_secret_access_key='';")
        con.execute("SET s3_session_token='';")
        df = con.execute(sql).fetchdf()
    finally:
        # Always release the connection, even when the query fails.
        con.close()
    if max_items is not None and len(df) > max_items:
        df = df.head(max_items)
    return df

`stats()` ¶

Return per-partition row counts and file sizes from Iceberg metadata.

Source code in earthcatalog/catalog.py

def stats(self) -> list[dict]:
    """Report per-partition row counts and file sizes from Iceberg metadata.

    Thin delegation to ``CatalogInfo.stats`` for the wrapped table.
    """
    catalog_info, iceberg_table = self._info, self._table
    return catalog_info.stats(iceberg_table)

`unique_item_count()` ¶

Return the count of unique STAC items from the hash index.

Source code in earthcatalog/catalog.py

def unique_item_count(self) -> int:
    """Count the unique STAC items recorded in the hash index.

    A default hash-index location is derived from the catalog's
    ``warehouse`` property (``<warehouse>_id_hashes.parquet``) and handed
    to ``CatalogInfo.unique_item_count``; it is ``None`` when there is no
    catalog or no warehouse configured.
    """
    warehouse = ""
    if self._catalog is not None:
        warehouse = self._catalog.properties.get("warehouse", "")

    fallback_path = f"{warehouse.rstrip('/')}_id_hashes.parquet" if warehouse else None
    return self._info.unique_item_count(self._table, self._store, fallback_path)

`info()` ¶

Return the grid metadata and catalog statistics object.

Source code in earthcatalog/catalog.py

def info(self) -> CatalogInfo:
    """Expose the ``CatalogInfo`` object holding grid metadata and statistics."""
    catalog_info = self._info
    return catalog_info

`ingest(inventory_path, *, mode='auto', chunk_size=10000, limit=None, since=None, update_hash_index=False)` ¶

Ingest STAC items from an S3 Inventory into the catalog.

Unified entry point replacing both backfill.run_backfill and incremental.run. Handles full backfill (drop+recreate table) and delta append (add files to existing table).

The caller is responsible for holding an S3Lock around this call when running against a shared store (use self.lock()).

Source code in earthcatalog/catalog.py

def ingest(
    self,
    inventory_path: str,
    *,
    mode: str = "auto",
    chunk_size: int = 10000,
    limit: int | None = None,
    since: datetime | None = None,
    update_hash_index: bool = False,
) -> dict:
    """Ingest STAC items from an S3 Inventory into the catalog.

    Unified entry point replacing both ``backfill.run_backfill`` and
    ``incremental.run``.  Handles full backfill (drop+recreate table)
    and delta append (add files to existing table).

    The caller is responsible for holding an S3Lock around this call
    when running against a shared store (use ``self.lock()``).

    Parameters
    ----------
    inventory_path:
        Location of the inventory, handed to ``_iter_inventory``.
    mode:
        ``"full"`` drops and recreates the table, ``"delta"`` appends to
        it, ``"auto"`` picks ``"delta"`` when the table already holds rows
        and ``"full"`` otherwise.
    chunk_size:
        Number of inventory entries buffered before they are fetched and
        written.
    limit:
        Stop after this many ``.stac.json`` keys have been queued.
    since:
        Forwarded to ``_iter_inventory``.
    update_hash_index:
        When true and files were written, merge their item hashes into the
        hash index (``s3://`` index paths only).

    Returns
    -------
    dict with keys ``items_processed``, ``rows_written``, ``files_registered``.

    Raises
    ------
    ValueError
        If *mode* is not one of ``"auto"``, ``"full"``, ``"delta"``.
    RuntimeError
        If ``AWS_ACCESS_KEY_ID`` is not set in the environment.
    """
    # Reject unknown modes before anything else: a misspelled mode would
    # otherwise fall through to the destructive full path and drop the table.
    if mode not in ("auto", "full", "delta"):
        raise ValueError(
            f"Unknown ingest mode {mode!r}; expected 'auto', 'full', or 'delta'."
        )

    import os
    import uuid
    from concurrent.futures import ThreadPoolExecutor

    from earthcatalog.grids import build_partitioner
    from earthcatalog.pipelines.incremental import _fetch_item, _iter_inventory

    from .hash_index import (
        merge_hashes_from_parquets,
        read_hashes,
        write_hashes,
    )
    from .transform import (
        fan_out,
        group_by_partition,
        write_geoparquet_s3,
    )

    if not os.environ.get("AWS_ACCESS_KEY_ID"):
        raise RuntimeError(
            "No AWS credentials found in environment. "
            "ingest() requires write access to S3. "
            "Set AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY or use an IAM role."
        )

    if mode == "auto":
        try:
            n = sum(s["row_count"] for s in self._info.stats(self._table))
            mode = "delta" if n > 0 else "full"
        except Exception:
            # NOTE(review): any failure to read stats selects the destructive
            # full path — confirm this fallback is intended.
            mode = "full"

    is_delta = mode == "delta"

    from earthcatalog.config import GridConfig

    # Rebuild the partitioner from the grid metadata carried by the catalog.
    grid_cfg = GridConfig(
        type=self._info.grid_type,
        resolution=self._info.grid_resolution,
        boundaries_path=self._info.boundaries_path,
        id_field=self._info.id_field,
    )
    partitioner = build_partitioner(grid_cfg)

    warehouse_root = self._catalog.properties.get("warehouse", "")
    uri = self._catalog.properties.get("uri", "")
    local_db = uri.removeprefix("sqlite:///") if uri else "/tmp/earthcatalog.db"

    # Refresh the local catalog database from the store before modifying it.
    if self._store and self._catalog_key:
        self.download_catalog(local_db)

    if not is_delta:
        from pyiceberg.exceptions import NoSuchTableError

        try:
            self._catalog.drop_table(FULL_NAME)
        except NoSuchTableError:
            pass
        try:
            self._catalog.create_namespace(NAMESPACE)
        except Exception:
            # Best effort: the namespace normally exists already.
            pass
        self._table = get_or_create(self._catalog, grid_config=grid_cfg)

    total_items = 0
    total_rows = 0
    written_keys: list[str] = []
    batch: list[tuple[str, str]] = []

    def _flush(chunk: list[tuple[str, str]]) -> None:
        """Fetch one chunk of items and write one Parquet file per partition."""
        nonlocal total_rows

        with ThreadPoolExecutor(max_workers=16) as pool:
            # Falsy fetch results are dropped.
            items = list(filter(None, pool.map(lambda bc: _fetch_item(*bc), chunk)))

        if not items:
            return

        fo = fan_out(items, partitioner)
        if not fo:
            return

        for (cell, year), group_items in group_by_partition(fo).items():
            year_str = str(year) if year is not None else "unknown"
            # Random suffix keeps part files from different flushes distinct.
            part_tag = uuid.uuid4().hex[:8]
            s3_key = f"grid_partition={cell}/year={year_str}/part_{part_tag}.parquet"
            n, _ = write_geoparquet_s3(group_items, self._store, s3_key)
            if n > 0:
                written_keys.append(s3_key)
                total_rows += n

    print(f"Ingesting from: {inventory_path}")
    for bucket, key in _iter_inventory(inventory_path, since=since):
        if not key.endswith(".stac.json"):
            continue
        batch.append((bucket, key))
        total_items += 1
        if len(batch) >= chunk_size:
            _flush(batch)
            batch.clear()
        if limit and total_items >= limit:
            break

    # Write whatever is left over from the last partial chunk.
    if batch:
        _flush(batch)

    if written_keys:
        full_paths = [f"{warehouse_root.rstrip('/')}/{k}" for k in written_keys]
        # Register the new files with Iceberg in bounded batches.
        batch_sz = 2000
        for i in range(0, len(full_paths), batch_sz):
            self._table.add_files(full_paths[i : i + batch_sz])
        print(f"Registered {len(full_paths)} files in Iceberg catalog.")

    if update_hash_index and written_keys:
        hash_index_path = self._table.properties.get("earthcatalog.hash_index_path")
        if not hash_index_path:
            # No index recorded yet: use the default location and persist it
            # as a table property.
            hash_index_path = f"{warehouse_root.rstrip('/')}_id_hashes.parquet"
            with self._table.transaction() as tx:
                tx.set_properties(**{"earthcatalog.hash_index_path": hash_index_path})

        if hash_index_path.startswith("s3://"):
            import re as _re

            # Split "s3://bucket/key" to obtain the key inside the store.
            m = _re.match(r"s3://([^/]+)/(.+)", hash_index_path)
            if m:
                hash_key = m.group(2)
                existing = read_hashes(self._store, hash_key)
                print(f"  Existing hashes: {len(existing):,}")
                updated, n_new = merge_hashes_from_parquets(
                    full_paths, existing, store=self._store
                )
                print(f"  New hashes: {n_new:,} from {len(full_paths)} files")
                write_hashes(updated, self._store, hash_key)
        else:
            print("WARN: hash index update skipped — only s3:// paths supported")

    # Publish the modified catalog database back to the store.
    if self._store and self._catalog_key:
        self.upload_catalog(local_db)

    result = {
        "items_processed": total_items,
        "rows_written": total_rows,
        "files_registered": len(written_keys),
    }
    print(f"Done. {total_items} items -> {total_rows} rows in {len(written_keys)} files")
    return result

`bulk_ingest(inventory_path, *, mode='auto', chunk_size=100000, compact_rows=100000, limit=None, since=None, update_hash_index=False, staging_prefix=None, create_client=None, skip_inventory=False, skip_ingest=False, retry_pending=False)` ¶

Ingest large inventories using a distributed Dask cluster.

Source code in earthcatalog/catalog.py

def bulk_ingest(
    self,
    inventory_path: str,
    *,
    mode: str = "auto",
    chunk_size: int = 100_000,
    compact_rows: int = 100_000,
    limit: int | None = None,
    since: datetime | None = None,
    update_hash_index: bool = False,
    staging_prefix: str | None = None,
    create_client: Callable[[], object] | None = None,
    skip_inventory: bool = False,
    skip_ingest: bool = False,
    retry_pending: bool = False,
) -> None:
    """Ingest large inventories using a distributed Dask cluster.

    Resolves the ingest mode, refreshes the local catalog database from
    the store, then delegates the work to
    ``earthcatalog.pipelines.backfill.run_backfill``.  The global
    ``store_config`` is pointed at this catalog's store for the duration
    of the run and restored afterwards.

    Parameters
    ----------
    inventory_path:
        Inventory location, forwarded to ``run_backfill``.
    mode:
        ``"full"``, ``"delta"``, or ``"auto"`` (delta when the table
        already holds rows, full otherwise).
    staging_prefix:
        Staging key prefix; defaults to ``bulk_ingest/<UTC date YYYYMMDD>``.
    chunk_size, compact_rows, limit, since, update_hash_index,
    create_client, skip_inventory, skip_ingest, retry_pending:
        Forwarded unchanged to ``run_backfill``.

    Raises
    ------
    ValueError
        If *mode* is not one of ``"auto"``, ``"full"``, ``"delta"``.
    RuntimeError
        If ``AWS_ACCESS_KEY_ID`` is not set in the environment.
    """
    # Fail fast on a misspelled mode instead of silently running a delta.
    if mode not in ("auto", "full", "delta"):
        raise ValueError(
            f"Unknown ingest mode {mode!r}; expected 'auto', 'full', or 'delta'."
        )

    import os
    from datetime import UTC
    from datetime import datetime as _dt

    from earthcatalog.config import GridConfig
    from earthcatalog.grids import build_partitioner
    from earthcatalog.pipelines.backfill import run_backfill

    if not os.environ.get("AWS_ACCESS_KEY_ID"):
        raise RuntimeError(
            "No AWS credentials found in environment. "
            "bulk_ingest() requires write access to S3. "
            "Set AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY or use an IAM role."
        )

    warehouse_root = self._catalog.properties.get("warehouse", "")
    uri = self._catalog.properties.get("uri", "")
    local_db = uri.removeprefix("sqlite:///")

    # Rebuild the partitioner from the grid metadata carried by the catalog.
    grid_cfg = GridConfig(
        type=self._info.grid_type,
        resolution=self._info.grid_resolution,
        boundaries_path=self._info.boundaries_path,
        id_field=self._info.id_field,
    )
    partitioner = build_partitioner(grid_cfg)

    if staging_prefix is None:
        date_str = _dt.now(UTC).strftime("%Y%m%d")
        staging_prefix = f"bulk_ingest/{date_str}"

    delta = True
    if mode == "full":
        delta = False
    elif mode == "auto":
        try:
            n = sum(s["row_count"] for s in self._info.stats(self._table))
            delta = n > 0
        except Exception:
            # NOTE(review): any failure to read stats selects a full rebuild
            # — confirm this fallback is intended.
            delta = False

    if self._store and self._catalog_key:
        self.download_catalog(local_db)

    from . import store_config

    # run_backfill is given the store explicitly, but the global store config
    # is also switched to it for the run; the previous values are restored
    # even if the run fails.
    old_store = store_config.get_store()
    old_key = store_config.get_catalog_key()
    try:
        store_config.set_store(self._store)
        if self._catalog_key:
            store_config.set_catalog_key(self._catalog_key)

        run_backfill(
            inventory_path=inventory_path,
            catalog_path=local_db,
            staging_store=self._store,
            staging_prefix=staging_prefix,
            warehouse_store=self._store,
            warehouse_root=warehouse_root,
            partitioner=partitioner,
            chunk_size=chunk_size,
            compact_rows=compact_rows,
            limit=limit,
            since=since,
            use_lock=False,
            upload=True,
            skip_inventory=skip_inventory,
            skip_ingest=skip_ingest,
            retry_pending=retry_pending,
            delta=delta,
            create_client=create_client,
            update_hash_index=update_hash_index,
            hash_index_path=self._table.properties.get("earthcatalog.hash_index_path"),
        )
    finally:
        store_config.set_store(old_store)
        store_config.set_catalog_key(old_key)

`download_catalog(local_path)` ¶

Download catalog.db from the backing store to local_path.

Source code in earthcatalog/catalog.py

def download_catalog(self, local_path: str) -> None:
    """Download catalog.db from the backing store to *local_path*.

    Both this catalog's store and its catalog key are passed on.  The
    module-level helper falls back to the global store configuration when
    the key is missing, so omitting it would make the helper ignore
    ``self._store`` as well.
    """
    download_catalog(local_path, store=self._store, catalog_key=self._catalog_key)

`upload_catalog(local_path)` ¶

Upload catalog.db from local_path to the backing store.

Source code in earthcatalog/catalog.py

def upload_catalog(self, local_path: str) -> None:
    """Upload catalog.db from *local_path* to the backing store.

    Both this catalog's store and its catalog key are passed on.  The
    module-level helper falls back to the global store configuration when
    the key is missing, so omitting it would make the helper ignore
    ``self._store`` as well.
    """
    upload_catalog(local_path, store=self._store, catalog_key=self._catalog_key)

`compact(threshold=2, dry_run=False)` ¶

Compact over-threshold partition buckets and rebuild the Iceberg catalog.

Wraps :func:earthcatalog.maintenance.compact.compact_warehouse using this catalog's warehouse path and local catalog database.

Parameters¶

- `threshold`: Minimum number of part files in a bucket before it is compacted. Default: 2 (compact any bucket with more than one part file).
- `dry_run`: When `True`, report what would be compacted but make no changes.

Returns¶

Summary dict with keys buckets_scanned, buckets_compacted, files_before, files_after.

Source code in earthcatalog/catalog.py

def compact(
    self,
    threshold: int = 2,
    dry_run: bool = False,
) -> dict[str, int]:
    """Compact over-threshold partition buckets and rebuild the Iceberg catalog.

    Delegates to :func:`earthcatalog.maintenance.compact.compact_warehouse`,
    supplying the warehouse path and the local SQLite database path taken
    from this catalog's properties.

    Parameters
    ----------
    threshold:
        Minimum number of part files in a bucket before it is compacted.
        Default: 2 (compact any bucket with more than one part file).
    dry_run:
        When ``True``, report what *would* be compacted but make no changes.

    Returns
    -------
    Summary dict with keys ``buckets_scanned``, ``buckets_compacted``,
    ``files_before``, ``files_after``.
    """
    from earthcatalog.maintenance.compact import compact_warehouse

    catalog_props = self._catalog.properties
    return compact_warehouse(
        warehouse_path=catalog_props.get("warehouse", ""),
        # The catalog URI is a SQLite URL; strip the scheme to get the file path.
        catalog_path=catalog_props.get("uri", "").removeprefix("sqlite:///"),
        threshold=threshold,
        dry_run=dry_run,
    )

`lock(owner, ttl_hours=12)` ¶

Return an S3Lock that uses this EarthCatalog's store and key.

Source code in earthcatalog/catalog.py

def lock(self, owner: str, ttl_hours: int = 12):
    """Build an ``S3Lock`` bound to this catalog's store.

    The lock key is the catalog's ``_lock_key`` attribute when it is set
    and non-empty, otherwise ``".lock"``.
    """
    from .lock import S3Lock

    key = getattr(self._catalog, "_lock_key", None)
    if not key:
        key = ".lock"
    return S3Lock(owner=owner, ttl_hours=ttl_hours, store=self._store, key=key)

`cells_for_geometry(geom)` ¶

Return the partition keys that intersect geom.

Source code in earthcatalog/catalog.py

def cells_for_geometry(self, geom) -> list[str]:
    """List the partition keys that intersect *geom* (delegates to ``CatalogInfo``)."""
    partition_keys = self._info.cells_for_geometry(geom)
    return partition_keys

`cell_list_sql(geom)` ¶

Return a SQL fragment suitable for WHERE grid_partition IN (...).

Source code in earthcatalog/catalog.py

def cell_list_sql(self, geom) -> str:
    """Build the SQL fragment for ``WHERE grid_partition IN (...)`` covering *geom*."""
    sql_fragment = self._info.cell_list_sql(geom)
    return sql_fragment

Functions¶

`download_catalog(local_path, store=None, catalog_key=None)` ¶

Pull catalog.db from store to local_path before a job starts.

Source code in earthcatalog/catalog.py

def download_catalog(
    local_path: str,
    store: object | None = None,
    catalog_key: str | None = None,
) -> None:
    """Pull catalog.db from *store* to *local_path* before a job starts.

    Parameters
    ----------
    local_path:
        Destination file on the local filesystem.
    store:
        Store to read from; defaults to the globally configured store.
    catalog_key:
        Key of catalog.db inside the store; defaults to the globally
        configured catalog key.

    Each argument falls back to the global configuration independently, so
    an explicitly passed store is honoured even when no key is given.
    A missing remote catalog is not an error: a notice is printed and
    nothing is written.
    """
    if store is None:
        store = store_config.get_store()
    if catalog_key is None:
        catalog_key = store_config.get_catalog_key()

    try:
        payload = bytes(obstore.get(store, catalog_key).bytes())
    except FileNotFoundError:
        print(f"No existing catalog at '{catalog_key}' — will create fresh.")
        return

    # Written outside the ``try`` so a bad local path is not misreported as a
    # missing remote catalog.
    Path(local_path).write_bytes(payload)
    print(f"Catalog downloaded: {catalog_key} -> {local_path}")

`upload_catalog(local_path, store=None, catalog_key=None)` ¶

Push the updated catalog.db to store after all writes.

Source code in earthcatalog/catalog.py

def upload_catalog(
    local_path: str,
    store: object | None = None,
    catalog_key: str | None = None,
) -> None:
    """Push the updated catalog.db to *store* after all writes.

    Parameters
    ----------
    local_path:
        Local catalog.db file to upload.
    store:
        Store to write to; defaults to the globally configured store.
    catalog_key:
        Destination key inside the store; defaults to the globally
        configured catalog key.

    Each argument falls back to the global configuration independently, so
    an explicitly passed store is honoured even when no key is given.
    """
    if store is None:
        store = store_config.get_store()
    if catalog_key is None:
        catalog_key = store_config.get_catalog_key()
    obstore.put(store, catalog_key, Path(local_path).read_bytes())
    print(f"Catalog uploaded: {local_path} -> {catalog_key}")

`get_or_create(catalog, grid_config=None)` ¶

Return the stac_items table, creating it (and the namespace) if needed.

Parameters¶

- `catalog`: Open SqlCatalog instance.
- `grid_config`: Optional :class:earthcatalog.config.GridConfig. When provided, grid metadata (type, resolution, boundaries_path, id_field) is stored as Iceberg table properties so that :class:CatalogInfo can reconstruct the grid system without any external configuration.

Source code in earthcatalog/catalog.py

def get_or_create(catalog: SqlCatalog, grid_config=None) -> object:
    """Load the stac_items table, creating the namespace and table on demand.

    Parameters
    ----------
    catalog:
        Open SqlCatalog instance.
    grid_config:
        Optional :class:`earthcatalog.config.GridConfig`.  Its grid metadata
        (type, resolution, boundaries_path, id_field) is written as Iceberg
        table properties so that :class:`CatalogInfo` can reconstruct the
        grid system without any external configuration.  On an existing
        table only properties that are not yet set are added.
    """
    try:
        catalog.create_namespace(NAMESPACE)
    except NamespaceAlreadyExistsError:
        pass  # namespace already present — nothing to do

    props: dict[str, str] = {}
    if grid_config is not None:
        props[PROP_GRID_TYPE] = str(grid_config.type)
        optional_fields = (
            (PROP_GRID_RESOLUTION, grid_config.resolution),
            (PROP_GRID_BOUNDARIES_PATH, grid_config.boundaries_path),
            (PROP_GRID_ID_FIELD, grid_config.id_field),
        )
        # Unset grid attributes are not recorded at all.
        props.update(
            {name: str(value) for name, value in optional_fields if value is not None}
        )

    try:
        existing = catalog.load_table(FULL_NAME)
        to_add = {name: value for name, value in props.items() if name not in existing.properties}
        if to_add:
            with existing.transaction() as txn:
                txn.set_properties(**to_add)
        return existing
    except NoSuchTableError:
        return catalog.create_table(
            identifier=FULL_NAME,
            schema=ICEBERG_SCHEMA,
            partition_spec=PARTITION_SPEC,
            properties=props,
        )

`open(store, base, *, anonymous=None)` ¶

Open an EarthCatalog backed by store at base.

Parameters¶

- `store`: An obstore-compatible store (S3Store, LocalStore, etc.). All catalog I/O (download, upload) and warehouse file operations flow through this store.
- `base`: Base path containing `earthcatalog.db` (SQLite Iceberg catalog) and `warehouse/` (GeoParquet files), and optionally `warehouse_id_hashes.parquet` (hash index).
- `anonymous`: Force anonymous S3 access when the warehouse path is s3://. Auto-detected for stores with skip_signature=True.

Returns¶

EarthCatalog Facade combining PyIceberg catalog, table, and grid metadata.

Source code in earthcatalog/catalog.py

def open(
    store: object,
    base: str,
    *,
    anonymous: bool | None = None,
) -> EarthCatalog:
    """Open an EarthCatalog backed by *store* at *base*.

    The catalog database is copied from the store into a uniquely named
    file in the system temp directory; when it does not exist remotely an
    empty catalog is started instead.

    Parameters
    ----------
    store:
        An obstore-compatible store (``S3Store``, ``LocalStore``, etc.).
        All catalog I/O (download, upload) and warehouse file operations
        flow through this store.
    base:
        Base path containing:
        - ``earthcatalog.db``   (SQLite Iceberg catalog)
        - ``warehouse/``        (GeoParquet files)
        Optionally:
        - ``warehouse_id_hashes.parquet`` (hash index)
    anonymous:
        Force anonymous S3 access when the warehouse path is ``s3://``.
        Auto-detected for stores with ``skip_signature=True``.  When it is
        neither requested nor detected, the ``s3.anonymous`` property is
        left unset.

    Returns
    -------
    EarthCatalog
        Facade combining PyIceberg catalog, table, and grid metadata.
    """
    import os
    import tempfile
    import uuid

    _warehouse_path = f"{base}/warehouse"

    if base.startswith("s3://"):
        # Drop the scheme and bucket: the store addresses keys relative to the bucket.
        rest = base[5:]
        parts = rest.split("/", 1)
        catalog_key = f"{parts[1]}/earthcatalog.db" if len(parts) > 1 else "earthcatalog.db"
    else:
        catalog_key = str(Path(base) / "earthcatalog.db")

    # Each call works on its own private copy of the catalog database.
    _db_path = str(Path(tempfile.gettempdir()) / f"earthcatalog_{uuid.uuid4().hex[:8]}.db")
    try:
        result = obstore.get(store, catalog_key)
        Path(_db_path).write_bytes(bytes(result.bytes()))
    except FileNotFoundError:
        # No catalog in the store yet.
        pass

    if anonymous is None and hasattr(store, "config"):
        skip_sig = store.config.get("skip_signature")
        if skip_sig in (True, "true"):
            anonymous = True

    region = os.environ.get("AWS_DEFAULT_REGION") or os.environ.get("AWS_REGION") or "us-west-2"
    props: dict = {"uri": f"sqlite:///{_db_path}", "warehouse": _warehouse_path}

    if _warehouse_path.startswith("s3://"):
        props["s3.region"] = region
        props["s3.endpoint"] = f"https://s3.{region}.amazonaws.com"
        # Only force unsigned requests when anonymous access was requested or
        # detected; previously both branches set this flag unconditionally.
        if anonymous:
            props["s3.anonymous"] = "true"

    sql_catalog = SqlCatalog(NAMESPACE, **props)
    table = get_or_create(sql_catalog)
    return EarthCatalog(
        catalog=sql_catalog,
        table=table,
        info=_catalog_info(table),
        store=store,
        catalog_key=catalog_key,
    )

`ingest(inventory_path, *, store=None, base=None, mode='auto', chunk_size=10000, limit=None, since=None, update_hash_index=False)` ¶

Open an EarthCatalog and ingest STAC items from an inventory.

Convenience wrapper around EarthCatalog.ingest() for callers that only have a store and base path.

Parameters¶

- `inventory_path`: Path or s3:// URI to an S3 Inventory file.
- `store`: An obstore-compatible store (S3Store, LocalStore, etc.).
- `base`: Base path containing earthcatalog.db and warehouse/.
- `mode`: "auto", "full", or "delta". See EarthCatalog.ingest.
- `chunk_size`: Items per fetch batch.
- `limit`: Max items to process.
- `since`: Only process items modified after this datetime.
- `update_hash_index`: Update the warehouse hash index after ingest.

Returns¶

dict with keys items_processed, rows_written, files_registered.

Source code in earthcatalog/catalog.py

def ingest(
    inventory_path: str,
    *,
    store: object | None = None,
    base: str | None = None,
    mode: str = "auto",
    chunk_size: int = 10000,
    limit: int | None = None,
    since: datetime | None = None,
    update_hash_index: bool = False,
) -> dict:
    """Ingest STAC items from an inventory into the catalog at *base*.

    One-shot helper for callers that hold only a store and a base path:
    the catalog is opened with ``open(store=store, base=base)`` and every
    other argument is forwarded unchanged to ``EarthCatalog.ingest()``,
    whose result is returned as-is.

    Parameters
    ----------
    inventory_path:
        Path or ``s3://`` URI to an S3 Inventory file.
    store:
        An obstore-compatible store (``S3Store``, ``LocalStore``, etc.).
    base:
        Base path containing ``earthcatalog.db`` and ``warehouse/``.
    mode:
        ``"auto"``, ``"full"``, or ``"delta"``.  See ``EarthCatalog.ingest``.
    chunk_size:
        Items per fetch batch.
    limit:
        Max items to process.
    since:
        Only process items modified after this datetime.
    update_hash_index:
        Update the warehouse hash index after ingest.

    Returns
    -------
    dict with keys ``items_processed``, ``rows_written``, ``files_registered``.
    """
    # Gather the pass-through options first; building the dict has no side
    # effects, so doing it before opening the catalog changes nothing.
    ingest_options = {
        "inventory_path": inventory_path,
        "mode": mode,
        "chunk_size": chunk_size,
        "limit": limit,
        "since": since,
        "update_hash_index": update_hash_index,
    }
    # NOTE: ``open`` here is this module's catalog opener, not the builtin.
    catalog_handle = open(store=store, base=base)
    return catalog_handle.ingest(**ingest_options)

earthcatalog.catalog.ICEBERG_SCHEMA = Schema(NestedField(1, 'id', StringType(), required=False), NestedField(2, 'grid_partition', StringType(), required=False), NestedField(3, 'geometry', BinaryType(), required=False), NestedField(4, 'datetime', TimestamptzType(), required=False), NestedField(5, 'platform', StringType(), required=False), NestedField(6, 'percent_valid_pixels', LongType(), required=False), NestedField(7, 'date_dt', LongType(), required=False), NestedField(8, 'proj:code', StringType(), required=False), NestedField(9, 'assets', StringType(), required=False), NestedField(10, 'links', StringType(), required=False), NestedField(11, 'stac_version', StringType(), required=False), NestedField(12, 'type', StringType(), required=False), NestedField(13, 'start_datetime', TimestamptzType(), required=False), NestedField(14, 'version', StringType(), required=False), NestedField(15, 'sat:orbit_state', StringType(), required=False), NestedField(16, 'scene_1_id', StringType(), required=False), NestedField(17, 'scene_2_id', StringType(), required=False), NestedField(18, 'scene_1_frame', StringType(), required=False), NestedField(19, 'scene_2_frame', StringType(), required=False), NestedField(20, 'mid_datetime', StringType(), required=False), NestedField(21, 'created', TimestamptzType(), required=False), NestedField(22, 'updated', TimestamptzType(), required=False), NestedField(23, 'end_datetime', TimestamptzType(), required=False), NestedField(24, 'stac_extensions', StringType(), required=False), NestedField(25, 'collection', StringType(), required=False), NestedField(26, 'latitude', DoubleType(), required=False), NestedField(27, 'longitude', DoubleType(), required=False), NestedField(28, 'bbox', StringType(), required=False)) `module-attribute` ¶

`earthcatalog.catalog.PARTITION_SPEC = PartitionSpec(PartitionField(source_id=2, field_id=100, transform=(IdentityTransform()), name='grid_partition'), PartitionField(source_id=4, field_id=101, transform=(YearTransform()), name='year'))` `module-attribute` ¶

`earthcatalog.transform` ¶

STAC item transformation: H3 fan-out + stac-geoparquet writing via rustac.

Public functions¶

fan_out(items, partitioner) Produce one synthetic STAC item per (source_item × grid_cell) pair. Injects grid_partition into each item's properties.

group_by_partition(fan_out_items) Group the output of fan_out() by (grid_partition, year) so that each group can be written to exactly one Parquet file. This is required for Iceberg IdentityTransform + YearTransform partition pruning.

write_geoparquet(fan_out_items, path) Write a single-partition list of synthetic items to a GeoParquet file using rustac.write(). rustac writes proper stac-geoparquet with: - assets as struct column - links as list column - properties promoted to top-level columns - geoarrow.wkb extension on geometry column

Spatial predicate pushdown¶

The correct usage pattern for spatial queries:

Convert the query geometry to grid cell IDs (e.g. H3 cells at resolution 1): candidate_cells = h3.geo_to_cells(mapping(query_geom), resolution=1)
Filter the Iceberg table via: WHERE grid_partition IN (candidate_cells). Iceberg's IdentityTransform partition pruning will skip all files whose grid_partition value is not in the candidate set.

Classes¶

`FileMetadata` `dataclass` ¶

Lightweight record describing one GeoParquet file written to a store.

Designed to be trivially serialisable by Dask (no PyArrow, no Iceberg imports) so it crosses the network from worker to head node at ~200 B/file.

Attributes¶

s3_key: Key relative to the warehouse store root — e.g. "grid_partition=81003ffffffffff/year=2025/part_000000_abc1.parquet". The caller that knows the store root appends this to construct the full URI used in table.add_files(). grid_partition: H3 cell string (or "__none__" for unlocated items). year: 4-digit calendar year from the datetime property, or None for items without a parseable datetime. row_count: Number of rows in the file. file_size_bytes: Byte size of the written Parquet file.

Source code in earthcatalog/transform.py

@dataclass
class FileMetadata:
    """
    Lightweight record describing one GeoParquet file written to a store.

    Designed to be trivially serialisable by Dask (no PyArrow, no Iceberg
    imports) so it crosses the network from worker to head node at ~200 B/file.

    Attributes
    ----------
    s3_key:
        Key relative to the warehouse store root — e.g.
        ``"grid_partition=81003ffffffffff/year=2025/part_000000_abc1.parquet"``.
        The caller that knows the store root appends this to construct the full
        URI used in ``table.add_files()``.
    grid_partition:
        H3 cell string (or ``"__none__"`` for unlocated items).
    year:
        4-digit calendar year from the ``datetime`` property, or ``None`` for
        items without a parseable datetime.
    row_count:
        Number of rows in the file.
    file_size_bytes:
        Byte size of the written Parquet file.
    """

    s3_key: str
    grid_partition: str
    year: int | None
    row_count: int
    file_size_bytes: int

Functions¶

`fan_out(stac_items, partitioner)` ¶

Produce one synthetic STAC item per (source_item × grid_cell) pair.

Each synthetic item is the original STAC item with grid_partition injected into its properties. All original fields (assets, links, collection, …) are preserved as-is so that rustac.write() can emit a complete stac-geoparquet file with the native rustac schema.

Items with unparseable or empty geometry are silently skipped.

Source code in earthcatalog/transform.py

def fan_out(
    stac_items: list[dict],
    partitioner: AbstractPartitioner,
) -> list[dict]:
    """
    Expand STAC items into one synthetic item per intersecting grid cell.

    For every source item the partitioner is asked which cell keys intersect
    the item's geometry (passed as WKB).  A shallow copy of the item is then
    emitted per key, identical to the source except that ``grid_partition``
    is set in its ``properties``.  Top-level fields (assets, links,
    collection, …) are carried over untouched so rustac.write() can emit a
    complete stac-geoparquet file with the native rustac schema.

    When the partitioner returns no keys, the item is emitted once with
    ``grid_partition`` set to ``"__none__"``.  Items whose geometry cannot be
    parsed — or for which the partitioner raises — are silently skipped.
    """
    from shapely.geometry import shape  # deferred — heavy library

    fanned: list[dict] = []
    for source in stac_items:
        base_props = source.get("properties", {})
        try:
            wkb_bytes = shape(source["geometry"]).wkb
            cell_keys = partitioner.get_intersecting_keys(wkb_bytes) or ["__none__"]
        except Exception:
            # Best-effort: a bad geometry drops this item, not the whole batch.
            continue

        fanned.extend(
            {**source, "properties": {**base_props, "grid_partition": cell}}
            for cell in cell_keys
        )

    return fanned

`group_by_partition(fan_out_items)` ¶

Group fan-out items by (grid_partition, year) and sort each group by (platform, datetime).

Each resulting group satisfies both Iceberg partition constraints:

IdentityTransform on grid_partition — every item in the group has the same grid_partition value, so Parquet column statistics give a single min == max that add_files() can use unambiguously.
YearTransform on datetime — every item in the group has a datetime in the same calendar year, so the year-level Parquet statistics are also unambiguous.

The within-group sort by (platform, datetime) maximises Parquet row-group min/max statistics for predicate pushdown on those columns.

Parameters¶

fan_out_items: Output of :func:fan_out — each item has exactly one grid_partition value in its properties.

Returns¶

dict mapping (cell_id, year) → sorted list of synthetic STAC items. year is None for items that carry no datetime property.

Source code in earthcatalog/transform.py

def group_by_partition(
    fan_out_items: list[dict],
) -> dict[tuple[str, int | None], list[dict]]:
    """
    Group fan-out items by ``(grid_partition, year)`` and sort each group by
    ``(platform, datetime)``.

    Each resulting group satisfies both Iceberg partition constraints:

    * ``IdentityTransform`` on ``grid_partition`` — every item in the group
      has the same ``grid_partition`` value, so Parquet column statistics give
      a single min == max that ``add_files()`` can use unambiguously.
    * ``YearTransform`` on ``datetime`` — every item in the group has a
      ``datetime`` in the same calendar year, so the year-level Parquet
      statistics are also unambiguous.

    The within-group sort by ``(platform, datetime)`` maximises Parquet
    row-group min/max statistics for predicate pushdown on those columns.

    Parameters
    ----------
    fan_out_items:
        Output of :func:`fan_out` — each item has exactly one
        ``grid_partition`` value in its ``properties``.

    Returns
    -------
    dict mapping ``(cell_id, year)`` → sorted list of synthetic STAC items.
    ``year`` is ``None`` for items that carry no ``datetime`` property.
    """
    groups: dict[tuple[str, int | None], list[dict]] = {}
    for item in fan_out_items:
        cell = item["properties"].get("grid_partition", "__none__")
        year = _year_from_item(item)
        key = (cell, year)
        groups.setdefault(key, []).append(item)

    # Sort within each group for optimal Parquet column statistics
    for key in groups:
        groups[key].sort(key=_sort_key)

    return groups

`write_geoparquet(fan_out_items, path)` ¶

Write fan-out STAC items to a GeoParquet file using rustac.

Caller's responsibility¶

Pass items for a single (grid_partition, year) group — i.e. the output of one iteration over :func:group_by_partition. If items span multiple partitions the resulting file will violate the Iceberg IdentityTransform constraint and table.add_files() will raise a ValueError.

rustac writes the full stac-geoparquet schema. A post-processing step casts struct/list columns (assets, links) to JSON strings and drops null-typed columns (collection) so the file is compatible with PyIceberg V2 add_files().

Returns the number of rows written (0 if the input list is empty).

Source code in earthcatalog/transform.py

def write_geoparquet(fan_out_items: list[dict], path: str) -> int:
    """
    Write fan-out STAC items to a local GeoParquet file via rustac, then
    rewrite that file in an Iceberg-compatible form.

    Caller's responsibility
    -----------------------
    Pass items for **a single (grid_partition, year) group** — i.e. the output
    of one iteration over :func:`group_by_partition`.  If items span multiple
    partitions the resulting file will violate the Iceberg ``IdentityTransform``
    constraint and ``table.add_files()`` will raise a ``ValueError``.

    Two passes over *path*
    ----------------------
    1. ``rustac.write`` emits the full stac-geoparquet schema.
    2. The file is read back, normalised by ``_normalize_for_iceberg`` (casts
       struct/list columns such as assets/links to JSON strings and drops
       null-typed columns such as collection) and written again with zstd
       compression so it is compatible with PyIceberg V2 ``add_files()``.

    Returns the number of items passed in (0 if the input list is empty, in
    which case nothing is written).
    """
    if not fan_out_items:
        return 0

    import pyarrow.parquet as pq

    async def _emit_with_rustac():
        await rustac.write(path, fan_out_items)

    # rustac.write is a coroutine; drive it on a private event loop that is
    # always closed afterwards.
    event_loop = asyncio.new_event_loop()
    try:
        event_loop.run_until_complete(_emit_with_rustac())
    finally:
        event_loop.close()

    # Read exactly this one file through ParquetFile — this avoids PyArrow's
    # Hive-partition directory discovery, which breaks inside partitioned
    # layouts.
    parquet_file = pq.ParquetFile(path)
    footer_kv = parquet_file.metadata.metadata
    normalized = _normalize_for_iceberg(parquet_file.read())

    # File-level keys written by rustac (geo, stac-geoparquet) live in the
    # Parquet file metadata, not in the Arrow schema metadata.  Copy them onto
    # the Arrow schema so pq.write_table re-encodes them on the second pass.
    carried = {
        meta_key: meta_value
        for meta_key, meta_value in footer_kv.items()
        if meta_key in (b"geo", b"stac-geoparquet")
    }
    if carried:
        normalized = normalized.replace_schema_metadata(
            {**(normalized.schema.metadata or {}), **carried}
        )

    # Hand PyArrow a file object instead of a path string so it cannot treat
    # the parent directory as a Hive-partitioned dataset (which causes
    # schema-merge errors inside layouts like grid_partition=X/year=Y/).
    # S3 paths never reach this code path — write_geoparquet_s3 writes to a
    # local temp file then uploads via obstore.
    with open(path, "wb") as sink:
        pq.write_table(normalized, sink, compression="zstd")

    return len(fan_out_items)

`write_geoparquet_s3(fan_out_items, store, s3_key)` ¶

Write a single-partition list of fan-out items as GeoParquet to a store.

Writes to a local temporary file (via :func:write_geoparquet) then uploads the bytes via obstore.put. The temporary file is always deleted, even on error.

This is the S3-capable counterpart to :func:write_geoparquet. Workers on a Dask cluster call this function directly; the store is injected so the function is testable with any obstore-compatible backend (MemoryStore, LocalStore, S3Store).

Parameters¶

fan_out_items: Output of one group_by_partition() iteration — all items must belong to the same (grid_partition, year) group. store: An obstore-compatible store (S3Store, LocalStore, or MemoryStore). s3_key: Key within the store, e.g. "grid_partition=81003ffffffffff/year=2025/part_000000_abc1.parquet".

Returns¶

(row_count, byte_count) — both zero if fan_out_items is empty (no file is uploaded in that case).

Source code in earthcatalog/transform.py

def write_geoparquet_s3(
    fan_out_items: list[dict],
    store: object,
    s3_key: str,
) -> tuple[int, int]:
    """
    Write a **single-partition** list of fan-out items as GeoParquet to a store.

    The file is first produced locally in a temporary file by
    :func:`write_geoparquet`; its bytes are then uploaded with
    ``obstore.put``.  The temporary file is removed in every case, including
    when writing or uploading fails.

    This is the S3-capable counterpart to :func:`write_geoparquet`.  Workers
    on a Dask cluster call this function directly; the store is injected so
    the function is testable with any ``obstore``-compatible backend
    (``MemoryStore``, ``LocalStore``, ``S3Store``).

    Parameters
    ----------
    fan_out_items:
        Output of one ``group_by_partition()`` iteration — all items must
        belong to the same ``(grid_partition, year)`` group.
    store:
        An ``obstore``-compatible store (``S3Store``, ``LocalStore``, or
        ``MemoryStore``).
    s3_key:
        Key within the store, e.g.
        ``"grid_partition=81003ffffffffff/year=2025/part_000000_abc1.parquet"``.

    Returns
    -------
    ``(row_count, byte_count)`` — both zero if *fan_out_items* is empty (no
    file is uploaded in that case).
    """
    if not fan_out_items:
        return 0, 0

    # delete=False: only the reserved name is needed here; the handle is
    # closed immediately and the file is removed by the ``finally`` below.
    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as scratch:
        tmp_path = scratch.name

    local_file = Path(tmp_path)
    try:
        rows = write_geoparquet(fan_out_items, tmp_path)
        if rows == 0:
            return 0, 0
        payload = local_file.read_bytes()
        obstore.put(store, s3_key, payload)
        return rows, len(payload)
    finally:
        local_file.unlink(missing_ok=True)

`earthcatalog.lock` ¶

S3 atomic lockfile using conditional writes (If-None-Match: *).

Prevents concurrent writes to the SQLite catalog.db.

Uses the store configured in earthcatalog.store_config (defaults to LocalStore for zero-config local development and testing). Override the store before running a job:

from earthcatalog import store_config
from obstore.store import S3Store

store_config.set_store(S3Store(bucket="my-bucket", region="us-west-2"))
store_config.set_lock_key("catalog/.lock")

Usage

from earthcatalog.lock import S3Lock

with S3Lock(owner="incremental"):
    download_catalog(...)
    # ... do work ...
    upload_catalog(...)

Classes¶

`CatalogLocked` ¶

Bases: RuntimeError

Raised when the lock is held by another process.

Source code in earthcatalog/lock.py

class CatalogLocked(RuntimeError):
    """Signals that the catalog lock is currently held by another process."""

`S3Lock` ¶

Atomic lockfile using obstore conditional writes (If-None-Match: *).

When store and key are provided explicitly they are used directly; otherwise falls back to the global :mod:earthcatalog.store_config (deprecated path).

Stale locks (older than ttl_hours) are automatically overridden.

Source code in earthcatalog/lock.py

class S3Lock:
    """
    Atomic lockfile using obstore conditional writes (If-None-Match: *).

    When *store* and *key* are provided explicitly they are used directly;
    otherwise falls back to the global :mod:`earthcatalog.store_config`
    (deprecated path).

    Stale locks (older than ttl_hours) are automatically overridden.
    """

    def __init__(
        self,
        owner: str,
        ttl_hours: int = 12,
        store: object | None = None,
        key: str | None = None,
    ) -> None:
        """
        Args:
            owner:     Human-readable name for the lock holder (e.g. "backfill").
            ttl_hours: Age after which a lock is considered stale and overridable.
            store:     Optional explicit obstore store (avoids store_config globals).
            key:       Optional explicit lock key (avoids store_config globals).
        """
        self._owner = owner
        self._ttl = ttl_hours
        self._explicit_store = store
        self._explicit_key = key

    def __enter__(self) -> "S3Lock":
        self.acquire()
        return self

    def __exit__(self, *_: object) -> None:
        self.release()

    @property
    def _store(self) -> object:
        if self._explicit_store is not None:
            return self._explicit_store
        return store_config.get_store()

    @property
    def _key(self) -> str:
        if self._explicit_key is not None:
            return self._explicit_key
        return store_config.get_lock_key()

    def acquire(self) -> None:
        """
        Atomically acquire the lock via mode='create' (If-None-Match: *).

        Succeeds only if the key does not exist. On conflict, reads the
        existing lock; if stale, deletes and retries. Raises CatalogLocked
        if a fresh lock is held by another process.
        """
        payload = self._make_payload()

        try:
            obstore.put(self._store, self._key, payload, mode="create")
            print(f"Lock acquired by '{self._owner}'.")
            return
        except AlreadyExistsError:
            pass

        # Key exists — read it to decide what to do
        existing = self._read_lock()
        if existing is None:
            # Disappeared between our failed PUT and this GET — retry once
            obstore.put(self._store, self._key, payload, mode="create")
            print(f"Lock acquired by '{self._owner}' (second attempt).")
            return

        acquired_at = datetime.fromisoformat(existing["acquired"])
        age = datetime.now(UTC) - acquired_at
        ttl = timedelta(hours=existing.get("ttl_hours", self._ttl))

        if age >= ttl:
            print(
                f"WARNING: Overriding stale lock from '{existing['owner']}' "
                f"on {existing['hostname']} (age: {age}, TTL: {ttl})."
            )
            obstore.delete(self._store, self._key)
            obstore.put(self._store, self._key, payload, mode="create")
            print(f"Lock acquired by '{self._owner}' (after stale override).")
            return

        raise CatalogLocked(
            f"Catalog is locked by '{existing['owner']}' on "
            f"{existing['hostname']} since {existing['acquired']}. "
            f"Lock expires in {ttl - age}."
        )

    def release(self) -> None:
        try:
            obstore.delete(self._store, self._key)
            print(f"Lock released by '{self._owner}'.")
        except Exception:
            pass  # already gone — fine

    def _make_payload(self) -> bytes:
        return json.dumps(
            {
                "owner": self._owner,
                "pid": os.getpid(),
                "hostname": socket.gethostname(),
                "acquired": datetime.now(UTC).isoformat(),
                "ttl_hours": self._ttl,
            }
        ).encode()

    def _read_lock(self) -> dict | None:
        try:
            result = obstore.get(self._store, self._key)
            return json.loads(bytes(result.bytes()))
        except FileNotFoundError:
            return None

Functions¶

`__init__(owner, ttl_hours=12, store=None, key=None)` ¶

Parameters:

Name	Type	Description	Default
`owner`	`str`	Human-readable name for the lock holder (e.g. "backfill").	required
`ttl_hours`	`int`	Age after which a lock is considered stale and overridable.	`12`
`store`	`object \| None`	Optional explicit obstore store (avoids store_config globals).	`None`
`key`	`str \| None`	Optional explicit lock key (avoids store_config globals).	`None`

Source code in earthcatalog/lock.py

def __init__(
    self,
    owner: str,
    ttl_hours: int = 12,
    store: object | None = None,
    key: str | None = None,
) -> None:
    """
    Record the lock settings on the instance; nothing is read from or
    written to the store here.

    Args:
        owner:     Human-readable name for the lock holder (e.g. "backfill").
        ttl_hours: Age after which a lock is considered stale and overridable.
        store:     Optional explicit obstore store (avoids store_config globals).
        key:       Optional explicit lock key (avoids store_config globals).
    """
    self._owner = owner
    self._ttl = ttl_hours
    # Kept as given (possibly None); stored under "_explicit_*" names so a
    # missing value can be told apart from a supplied one later.
    self._explicit_store = store
    self._explicit_key = key

`acquire()` ¶

Atomically acquire the lock via mode='create' (If-None-Match: *).

Succeeds only if the key does not exist. On conflict, reads the existing lock; if stale, deletes and retries. Raises CatalogLocked if a fresh lock is held by another process.

Source code in earthcatalog/lock.py

def acquire(self) -> None:
    """
    Atomically acquire the lock via mode='create' (If-None-Match: *).

    Succeeds only if the key does not exist. On conflict, reads the
    existing lock; if stale, deletes and retries once.

    Raises:
        CatalogLocked: a fresh lock is held by another process, or
            another process won the race for the single retry.
    """
    payload = self._make_payload()

    try:
        obstore.put(self._store, self._key, payload, mode="create")
        print(f"Lock acquired by '{self._owner}'.")
        return
    except AlreadyExistsError:
        pass

    # Key exists — read it to decide what to do
    existing = self._read_lock()
    if existing is None:
        # Disappeared between our failed PUT and this GET — retry once
        how = "second attempt"
    else:
        acquired_at = datetime.fromisoformat(existing["acquired"])
        age = datetime.now(UTC) - acquired_at
        # Prefer the TTL recorded by the holder; fall back to our own.
        ttl = timedelta(hours=existing.get("ttl_hours", self._ttl))

        if age < ttl:
            raise CatalogLocked(
                f"Catalog is locked by '{existing['owner']}' on "
                f"{existing['hostname']} since {existing['acquired']}. "
                f"Lock expires in {ttl - age}."
            )

        print(
            f"WARNING: Overriding stale lock from '{existing['owner']}' "
            f"on {existing['hostname']} (age: {age}, TTL: {ttl})."
        )
        try:
            obstore.delete(self._store, self._key)
        except FileNotFoundError:
            # A concurrent process already removed the stale lock; the
            # conditional create below still decides who wins.
            pass
        how = "after stale override"

    # The retry is itself a conditional create, so a competing process
    # may take the lock first.  Report that through the documented
    # CatalogLocked instead of leaking the raw store exception.
    try:
        obstore.put(self._store, self._key, payload, mode="create")
    except AlreadyExistsError as exc:
        raise CatalogLocked(
            f"Catalog lock was taken by another process while "
            f"'{self._owner}' was retrying."
        ) from exc
    print(f"Lock acquired by '{self._owner}' ({how}).")

`earthcatalog.store_config` ¶

Global store configuration for earthcatalog.

Defaults to a LocalStore rooted at /tmp/earthcatalog_store for zero-config local development and testing. Override before running any job:

from earthcatalog import store_config
from obstore.store import S3Store

store_config.set_store(S3Store(bucket="my-bucket", region="us-west-2"))
store_config.set_catalog_key("catalog/catalog.db")
store_config.set_lock_key("catalog/.lock")

Functions¶

`get_store()` ¶

Return the active obstore-compatible store.

Source code in earthcatalog/store_config.py

def get_store() -> object:
    """Return the active obstore-compatible store.

    This is the module-level ``_store`` value: whatever was most recently
    installed through ``set_store``, or the module's initial value if it was
    never overridden.
    """
    return _store

`set_store(store)` ¶

Override the store backend (e.g. S3Store for production).

Source code in earthcatalog/store_config.py

def set_store(store: object) -> None:
    """Override the store backend (e.g. S3Store for production).

    Rebinds the module-level ``_store``, so the new value is what later
    ``get_store()`` calls return.  No validation is performed on *store*.
    """
    # ``global`` makes the assignment rebind the module variable instead of
    # creating a function-local name.
    global _store
    _store = store

`earthcatalog.partitioner` ¶

Abstract base class for spatial partitioners.

A partitioner maps a WKB geometry to one or more grid cell keys. The boundary-inclusive contract means that a geometry touching a cell boundary is assigned to that cell, preventing coverage gaps along shared edges.

Built-in implementations¶

:class:~earthcatalog.grids.h3_partitioner.H3Partitioner — Uber H3 hexagonal grid
:class:~earthcatalog.grids.geojson_partitioner.GeoJSONPartitioner — arbitrary polygon regions

Custom partitioners¶

Subclass :class:AbstractPartitioner and implement :meth:get_intersecting_keys, then pass an instance to :func:~earthcatalog.transform.fan_out.

Classes¶

`AbstractPartitioner` ¶

Bases: ABC

Given a WKB geometry, return the set of grid cell keys whose boundaries intersect that geometry. A single item may map to multiple keys (the Overlap Multiplier).

Source code in earthcatalog/partitioner.py

class AbstractPartitioner(ABC):
    """
    Interface for spatial partitioners.

    An implementation maps a WKB geometry to the grid cell keys whose
    boundaries intersect it.  One geometry may map to several keys (the
    Overlap Multiplier).
    """

    @abstractmethod
    def get_intersecting_keys(self, geom_wkb: bytes) -> list[str]:
        """Return grid cell IDs that intersect the given WKB geometry."""

Functions¶

`get_intersecting_keys(geom_wkb)` `abstractmethod` ¶

Return grid cell IDs that intersect the given WKB geometry.

Source code in earthcatalog/partitioner.py

@abstractmethod
def get_intersecting_keys(self, geom_wkb: bytes) -> list[str]:
    """Return grid cell IDs that intersect the given WKB geometry.

    Abstract hook: concrete partitioners must override it.  This base
    version has no logic and returns ``None`` if invoked directly.
    """

Core API¶

earthcatalog.catalog.EarthCatalog ¶

Attributes¶

grid_type property ¶

grid_resolution property ¶

table property ¶

Functions¶

__init__(catalog, table, info, store=None, *, catalog_key=None) ¶

search_files(geom, start_datetime=None, end_datetime=None) ¶

search(**kwargs) ¶

Performance¶

Returns¶

search_to_arrow(**kwargs) ¶

search_uris(**kwargs) ¶

duck_search(**kwargs) ¶

stats() ¶

unique_item_count() ¶

info() ¶

ingest(inventory_path, *, mode='auto', chunk_size=10000, limit=None, since=None, update_hash_index=False) ¶

bulk_ingest(inventory_path, *, mode='auto', chunk_size=100000, compact_rows=100000, limit=None, since=None, update_hash_index=False, staging_prefix=None, create_client=None, skip_inventory=False, skip_ingest=False, retry_pending=False) ¶

download_catalog(local_path) ¶

upload_catalog(local_path) ¶

compact(threshold=2, dry_run=False) ¶

Parameters¶

Returns¶

lock(owner, ttl_hours=12) ¶

cells_for_geometry(geom) ¶

cell_list_sql(geom) ¶

earthcatalog.catalog ¶

Classes¶

CatalogInfo dataclass ¶

Functions¶

cells_for_geometry(geom) ¶

cell_list_sql(geom) ¶

file_paths(table, geom, start_datetime=None, end_datetime=None) ¶

stats(table) ¶

top_cells(table, limit=5) ¶

total_files(table) ¶

unique_item_count(table, store, default_hash_index_path=None) ¶

EarthCatalog ¶

Attributes¶

grid_type property ¶

grid_resolution property ¶

table property ¶

Functions¶

search_files(geom, start_datetime=None, end_datetime=None) ¶

search(**kwargs) ¶

Performance¶

Returns¶

search_to_arrow(**kwargs) ¶

search_uris(**kwargs) ¶

duck_search(**kwargs) ¶

stats() ¶

unique_item_count() ¶

info() ¶

ingest(inventory_path, *, mode='auto', chunk_size=10000, limit=None, since=None, update_hash_index=False) ¶

bulk_ingest(inventory_path, *, mode='auto', chunk_size=100000, compact_rows=100000, limit=None, since=None, update_hash_index=False, staging_prefix=None, create_client=None, skip_inventory=False, skip_ingest=False, retry_pending=False) ¶

download_catalog(local_path) ¶

upload_catalog(local_path) ¶

compact(threshold=2, dry_run=False) ¶

Parameters¶

Returns¶

lock(owner, ttl_hours=12) ¶

cells_for_geometry(geom) ¶

cell_list_sql(geom) ¶

Functions¶

download_catalog(local_path, store=None, catalog_key=None) ¶

upload_catalog(local_path, store=None, catalog_key=None) ¶

get_or_create(catalog, grid_config=None) ¶

Parameters¶

open(store, base, *, anonymous=None) ¶

Parameters¶

Returns¶

ingest(inventory_path, *, store=None, base=None, mode='auto', chunk_size=10000, limit=None, since=None, update_hash_index=False) ¶

Parameters¶

Returns¶

earthcatalog.catalog.PARTITION_SPEC = PartitionSpec(PartitionField(source_id=2, field_id=100, transform=(IdentityTransform()), name='grid_partition'), PartitionField(source_id=4, field_id=101, transform=(YearTransform()), name='year')) module-attribute ¶

earthcatalog.transform ¶

Public functions¶

Spatial predicate pushdown¶

`earthcatalog.catalog.EarthCatalog` ¶

`grid_type` `property` ¶

`grid_resolution` `property` ¶

`table` `property` ¶

`__init__(catalog, table, info, store=None, *, catalog_key=None)` ¶

`search_files(geom, start_datetime=None, end_datetime=None)` ¶

`search(**kwargs)` ¶

`search_to_arrow(**kwargs)` ¶

`search_uris(**kwargs)` ¶

`duck_search(**kwargs)` ¶

`stats()` ¶

`unique_item_count()` ¶

`info()` ¶

`ingest(inventory_path, *, mode='auto', chunk_size=10000, limit=None, since=None, update_hash_index=False)` ¶

`bulk_ingest(inventory_path, *, mode='auto', chunk_size=100000, compact_rows=100000, limit=None, since=None, update_hash_index=False, staging_prefix=None, create_client=None, skip_inventory=False, skip_ingest=False, retry_pending=False)` ¶

`download_catalog(local_path)` ¶

`upload_catalog(local_path)` ¶

`compact(threshold=2, dry_run=False)` ¶

`lock(owner, ttl_hours=12)` ¶

`cells_for_geometry(geom)` ¶

`cell_list_sql(geom)` ¶

`earthcatalog.catalog` ¶

`CatalogInfo` `dataclass` ¶

`cells_for_geometry(geom)` ¶

`cell_list_sql(geom)` ¶

`file_paths(table, geom, start_datetime=None, end_datetime=None)` ¶

`stats(table)` ¶

`top_cells(table, limit=5)` ¶

`total_files(table)` ¶

`unique_item_count(table, store, default_hash_index_path=None)` ¶

`EarthCatalog` ¶

`grid_type` `property` ¶

`grid_resolution` `property` ¶

`table` `property` ¶

`search_files(geom, start_datetime=None, end_datetime=None)` ¶

`search(**kwargs)` ¶

`search_to_arrow(**kwargs)` ¶

`search_uris(**kwargs)` ¶

`duck_search(**kwargs)` ¶

`stats()` ¶

`unique_item_count()` ¶

`info()` ¶

`ingest(inventory_path, *, mode='auto', chunk_size=10000, limit=None, since=None, update_hash_index=False)` ¶

`bulk_ingest(inventory_path, *, mode='auto', chunk_size=100000, compact_rows=100000, limit=None, since=None, update_hash_index=False, staging_prefix=None, create_client=None, skip_inventory=False, skip_ingest=False, retry_pending=False)` ¶

`download_catalog(local_path)` ¶

`upload_catalog(local_path)` ¶

`compact(threshold=2, dry_run=False)` ¶

`lock(owner, ttl_hours=12)` ¶

`cells_for_geometry(geom)` ¶

`cell_list_sql(geom)` ¶

`download_catalog(local_path, store=None, catalog_key=None)` ¶

`upload_catalog(local_path, store=None, catalog_key=None)` ¶

`get_or_create(catalog, grid_config=None)` ¶

`open(store, base, *, anonymous=None)` ¶

`ingest(inventory_path, *, store=None, base=None, mode='auto', chunk_size=10000, limit=None, since=None, update_hash_index=False)` ¶

`earthcatalog.catalog.PARTITION_SPEC = PartitionSpec(PartitionField(source_id=2, field_id=100, transform=(IdentityTransform()), name='grid_partition'), PartitionField(source_id=4, field_id=101, transform=(YearTransform()), name='year'))` `module-attribute` ¶

`earthcatalog.transform` ¶

`FileMetadata` `dataclass` ¶

`fan_out(stac_items, partitioner)` ¶

`group_by_partition(fan_out_items)` ¶

`write_geoparquet(fan_out_items, path)` ¶

`write_geoparquet_s3(fan_out_items, store, s3_key)` ¶

`earthcatalog.lock` ¶

`CatalogLocked` ¶

`S3Lock` ¶

`__init__(owner, ttl_hours=12, store=None, key=None)` ¶

`acquire()` ¶

`earthcatalog.store_config` ¶

`get_store()` ¶

`set_store(store)` ¶

`earthcatalog.partitioner` ¶

`AbstractPartitioner` ¶

`get_intersecting_keys(geom_wkb)` `abstractmethod` ¶