Skip to main content

Pagination

Handle large data assets efficiently with pagination in the Carbon Arc SDK.

How Pagination Works

Most Carbon Arc APIs return paginated responses:

{
"data": [...], // Current page of results
"total": 1500, // Total records available
"page": 1, // Current page number
"page_size": 100 // Records per page
}

To get all data, loop through pages until you've collected all records.


Basic Usage

Single Page

from carbonarc import CarbonArcClient

client = CarbonArcClient(host="https://api.carbonarc.co", token="YOUR_TOKEN")

# Get first page only; the response also reports pagination metadata.
response = client.data.get_datasets(page=1, page_size=10)

# Defaults guard against responses that omit the metadata keys.
print(f"Page: {response.get('page', 1)}")
print(f"Total Available: {response.get('total', 'N/A')}")
print(f"Records on this page: {len(response.get('datasets', []))}")

Collect All Pages

# Accumulate every page of datasets into one list.
all_datasets = []
page = 1
page_size = 50

while True:
    response = client.data.get_datasets(page=page, page_size=page_size)
    batch = response.get("datasets", [])

    # An empty page means we've walked past the end of the data.
    if not batch:
        break

    all_datasets.extend(batch)

    # Stop as soon as the server-reported total has been collected.
    if len(all_datasets) >= response.get("total", 0):
        break

    page += 1

print(f"Collected {len(all_datasets)} total datasets")

Pagination by Endpoint

Different endpoints use different response keys:

| Endpoint | Data Key | Parameters |
|---|---|---|
| `client.data.get_datasets()` | `"datasets"` | `page`, `page_size` |
| `client.data.get_data_dictionary()` | `"data_dictionary"` or `"fields"` | varies |
| `client.data.get_data_sample()` | `"data"` or `"samples"` | varies |
| `client.ontology.get_entities()` | `"entities"` | `page`, `page_size` |
| `client.ontology.get_entities_for_insight()` | `"entities"` | `page`, `page_size` |
| `client.explorer.get_framework_data()` | `"data"` | `page`, `page_size` |

Helper Functions

Generic Paginator

def fetch_all_pages(fetch_func, data_key, page_size=100, max_pages=None, **kwargs):
    """
    Fetch all pages from a paginated API endpoint.

    Args:
        fetch_func: API method to call; must accept page/page_size keywords
        data_key: Key in the response dict that holds the records
        page_size: Number of records requested per page
        max_pages: Optional hard cap on how many pages to fetch
        **kwargs: Extra arguments forwarded to the API call

    Returns:
        List of all records collected across pages
    """
    collected = []
    page = 1

    while True:
        response = fetch_func(page=page, page_size=page_size, **kwargs)
        records = response.get(data_key, [])

        # An empty page signals the end of the data.
        if not records:
            break

        collected.extend(records)

        total = response.get("total", 0)
        have_everything = total and len(collected) >= total
        hit_page_cap = max_pages and page >= max_pages
        if have_everything or hit_page_cap:
            break

        page += 1

    return collected

# Usage
all_datasets = fetch_all_pages(client.data.get_datasets, "datasets")
all_entities = fetch_all_pages(
client.ontology.get_entities,
"entities",
representation="company"
)

Paginate Entities

def fetch_all_entities(client, representation="company", max_pages=None):
    """Fetch every entity of the given representation, page by page."""
    results = []
    page, page_size = 1, 100

    while True:
        response = client.ontology.get_entities(
            representation=representation,
            page=page,
            page_size=page_size,
        )

        batch = response.get("entities", [])
        if not batch:
            # Past the last page - nothing more to collect.
            break

        results.extend(batch)

        total = response.get("total", 0)
        if (total and len(results) >= total) or (max_pages and page >= max_pages):
            break

        page += 1

    return results

# Get all companies
companies = fetch_all_entities(client, representation="company")

Paginate Framework Data

import pandas as pd

def fetch_all_framework_data(client, framework_id, page_size=10000):
    """
    Fetch all data from a purchased framework.

    Args:
        client: CarbonArcClient instance
        framework_id: ID of the purchased framework
        page_size: Records per page (larger pages mean fewer round trips)

    Returns:
        pandas DataFrame containing every fetched record
    """
    all_data = []
    page = 1

    while True:
        response = client.explorer.get_framework_data(
            framework_id=framework_id,
            page=page,
            page_size=page_size,
        )

        rows = response.get("data", [])
        if not rows:
            break

        all_data.extend(rows)

        # Report progress after each page; guard the division when the
        # server did not return a total.
        total = response.get("total", 0)
        pct = (len(all_data) / total * 100) if total else 0
        print(f" Page {page}: {len(all_data):,}/{total:,} ({pct:.1f}%)")

        if total and len(all_data) >= total:
            break

        page += 1

    return pd.DataFrame(all_data)

# Usage
df = fetch_all_framework_data(client, "fw_abc123")
df.to_parquet("framework_data.parquet")

With Progress Bar

Install the progress-bar dependency first: `pip install tqdm`
from tqdm import tqdm
import math

def fetch_with_progress(fetch_func, data_key, page_size=100, **kwargs):
    """Fetch all pages of a paginated endpoint, displaying a tqdm progress bar."""

    # The first request doubles as a probe for the total record count.
    response = fetch_func(page=1, page_size=page_size, **kwargs)
    total = response.get("total", 0)
    first_batch = response.get(data_key, [])

    if not first_batch:
        return []

    all_data = list(first_batch)

    # Everything fit on page one - no progress bar needed.
    if len(all_data) >= total:
        return all_data

    total_pages = math.ceil(total / page_size)

    # Bar starts at 1 because page 1 was already fetched above.
    for page in tqdm(range(2, total_pages + 1), desc="Fetching", initial=1, total=total_pages):
        batch = fetch_func(page=page, page_size=page_size, **kwargs).get(data_key, [])

        if not batch:
            break

        all_data.extend(batch)

        if len(all_data) >= total:
            break

    return all_data

# Usage
all_datasets = fetch_with_progress(client.data.get_datasets, "datasets")

Memory-Efficient Generator

For very large data assets, process records one at a time:

def paginate_generator(fetch_func, data_key, page_size=100, **kwargs):
    """
    Lazily yield records from a paginated endpoint, one at a time.

    Only a single page is held in memory at any moment, which makes this
    suitable for very large data assets.
    """
    page = 1

    while True:
        response = fetch_func(page=page, page_size=page_size, **kwargs)
        batch = response.get(data_key, [])

        if not batch:
            return

        yield from batch

        # page * page_size records have been requested so far; stop once
        # that covers the reported total.
        total = response.get("total", 0)
        if total and page * page_size >= total:
            return

        page += 1

# Process records one at a time
for dataset in paginate_generator(client.data.get_datasets, "datasets"):
print(dataset.get("dataset_id"))

With Retry Logic

Handle transient failures gracefully:

import time

def fetch_all_with_retry(fetch_func, data_key, page_size=100, max_retries=3, **kwargs):
    """Fetch all pages, retrying each page with exponential backoff on failure."""
    all_data = []
    page = 1

    while True:
        # Retry this page up to max_retries times before giving up.
        for attempt in range(max_retries):
            try:
                response = fetch_func(page=page, page_size=page_size, **kwargs)
            except Exception:
                if attempt == max_retries - 1:
                    # Out of retries - let the failure propagate.
                    raise
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Page {page} failed, retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                break

        batch = response.get(data_key, [])
        if not batch:
            break

        all_data.extend(batch)

        total = response.get("total", 0)
        if total and len(all_data) >= total:
            break

        page += 1

    return all_data

Complete Paginator Class

All-in-one solution:

import pandas as pd
import time

class CarbonArcPaginator:
    """Reusable paginator for Carbon Arc API endpoints.

    Bundles page walking, optional progress output, and per-page retry
    with exponential backoff behind one small interface.
    """

    def __init__(self, client, default_page_size=100, max_retries=3, show_progress=True):
        # The client is kept for caller convenience; fetch methods are
        # still passed explicitly to each call.
        self.client = client
        self.default_page_size = default_page_size
        self.max_retries = max_retries
        self.show_progress = show_progress

    def fetch_all(self, fetch_func, data_key, page_size=None, max_pages=None, **kwargs):
        """Walk every page and return the combined list of records."""
        page_size = page_size or self.default_page_size
        all_data = []
        page = 1

        while True:
            response = self._fetch_with_retry(fetch_func, page, page_size, **kwargs)
            batch = response.get(data_key, [])
            if not batch:
                break

            all_data.extend(batch)

            total = response.get("total", 0)
            if self.show_progress and total:
                pct = len(all_data) / total * 100
                print(f" Progress: {len(all_data):,}/{total:,} ({pct:.1f}%)")

            if (total and len(all_data) >= total) or (max_pages and page >= max_pages):
                break

            page += 1

        return all_data

    def fetch_dataframe(self, fetch_func, data_key, **kwargs):
        """Walk every page and return the records as a pandas DataFrame."""
        return pd.DataFrame(self.fetch_all(fetch_func, data_key, **kwargs))

    def iterate(self, fetch_func, data_key, page_size=None, **kwargs):
        """Lazily yield records one at a time (memory efficient)."""
        page_size = page_size or self.default_page_size
        page = 1

        while True:
            response = self._fetch_with_retry(fetch_func, page, page_size, **kwargs)
            batch = response.get(data_key, [])
            if not batch:
                return

            yield from batch

            total = response.get("total", 0)
            if total and page * page_size >= total:
                return

            page += 1

    def _fetch_with_retry(self, fetch_func, page, page_size, **kwargs):
        """Fetch one page, retrying with exponential backoff on any exception."""
        for attempt in range(self.max_retries):
            try:
                return fetch_func(page=page, page_size=page_size, **kwargs)
            except Exception:
                if attempt == self.max_retries - 1:
                    raise
                wait = 2 ** attempt
                print(f"Retry {attempt + 1}/{self.max_retries} in {wait}s...")
                time.sleep(wait)

# Usage
paginator = CarbonArcPaginator(client)

# Get all datasets as a list
datasets = paginator.fetch_all(client.data.get_datasets, data_key="datasets")

# Get all companies as a DataFrame
df = paginator.fetch_dataframe(
    client.ontology.get_entities,
    data_key="entities",
    representation="company",
)

# Iterate memory-efficiently, one record at a time
for entity in paginator.iterate(client.ontology.get_entities, "entities"):
    process(entity)

Page Size Recommendations

| Use Case | Recommended Page Size |
|---|---|
| Quick preview / testing | 10-50 |
| General data fetching | 100 (default) |
| Block data collection | 500-1000 |
| Framework data export | 5000-10000 |
| Memory-constrained | 50-100 |

Trade-offs

Larger page size:

  • Pro: Fewer API calls = faster overall
  • Pro: Less network overhead
  • Con: More memory per request
  • Con: Longer wait if a request fails

Smaller page size:

  • Pro: Lower memory usage
  • Pro: Faster feedback / progress
  • Pro: Less data lost on failure
  • Con: More API calls = slower overall

Quick Reference

Basic Pattern

# The core loop every helper on this page is built from.
all_data = []
page = 1

while True:
    response = api_call(page=page, page_size=100)
    data = response.get("data_key", [])

    # Empty page: no more data.
    if not data:
        break

    all_data.extend(data)

    # Collected everything the server reported? Stop.
    if len(all_data) >= response.get("total", 0):
        break

    page += 1

Response Structure

{
"data": [...], // or "datasets", "entities", etc.
"total": 1500, // Total available records
"page": 1, // Current page
"page_size": 100 // Records per page
}

Helper Functions Summary

| Function | Description |
|---|---|
| `fetch_all_pages()` | Generic paginator |
| `fetch_all_entities()` | Paginate ontology entities |
| `fetch_all_framework_data()` | Paginate framework data |
| `fetch_with_progress()` | With tqdm progress bar |
| `fetch_all_with_retry()` | With automatic retry |
| `paginate_generator()` | Memory-efficient generator |
| `CarbonArcPaginator` | All-in-one class |

Next Steps