Pagination
Handle large data assets efficiently with pagination in the Carbon Arc SDK.
How Pagination Works
Most Carbon Arc APIs return paginated responses:
{
"data": [...], // Current page of results
"total": 1500, // Total records available
"page": 1, // Current page number
"page_size": 100 // Records per page
}
To get all data, loop through pages until you've collected all records.
Basic Usage
Single Page
from carbonarc import CarbonArcClient
# NOTE(review): replace YOUR_TOKEN with a real API token before running.
client = CarbonArcClient(host="https://api.carbonarc.co", token="YOUR_TOKEN")
# Get first page
response = client.data.get_datasets(page=1, page_size=10)
# .get() defaults guard against keys missing from the response dict.
print(f"Page: {response.get('page', 1)}")
print(f"Total Available: {response.get('total', 'N/A')}")
print(f"Records on this page: {len(response.get('datasets', []))}")
Collect All Pages
# Fetch every page of datasets, accumulating results as we go.
all_datasets = []
page = 1
page_size = 50

while True:
    response = client.data.get_datasets(page=page, page_size=page_size)
    data = response.get("datasets", [])

    # An empty page means the server has nothing more to send.
    if not data:
        break

    all_datasets.extend(data)

    # Stop early once the server-reported total has been reached.
    total = response.get("total", 0)
    if len(all_datasets) >= total:
        break

    page += 1

print(f"Collected {len(all_datasets)} total datasets")
Pagination by Endpoint
Different endpoints use different response keys:
| Endpoint | Data Key | Parameters |
|---|---|---|
| `client.data.get_datasets()` | `"datasets"` | `page`, `page_size` |
| `client.data.get_data_dictionary()` | `"data_dictionary"` or `"fields"` | varies |
| `client.data.get_data_sample()` | `"data"` or `"samples"` | varies |
| `client.ontology.get_entities()` | `"entities"` | `page`, `page_size` |
| `client.ontology.get_entities_for_insight()` | `"entities"` | `page`, `page_size` |
| `client.explorer.get_framework_data()` | `"data"` | `page`, `page_size` |
Helper Functions
Generic Paginator
def fetch_all_pages(fetch_func, data_key, page_size=100, max_pages=None, **kwargs):
    """
    Fetch all pages from a paginated API endpoint.

    Args:
        fetch_func: API method to call
        data_key: Key in response containing the data
        page_size: Records per page
        max_pages: Optional limit on pages to fetch
        **kwargs: Additional arguments to pass to the API

    Returns:
        List of all records
    """
    collected = []
    current = 1
    while True:
        payload = fetch_func(page=current, page_size=page_size, **kwargs)
        batch = payload.get(data_key, [])
        if not batch:
            break  # empty page: nothing left on the server
        collected.extend(batch)
        reported_total = payload.get("total", 0)
        if reported_total and len(collected) >= reported_total:
            break  # everything the server reported has been fetched
        if max_pages and current >= max_pages:
            break  # caller-imposed page cap reached
        current += 1
    return collected
# Usage
all_datasets = fetch_all_pages(client.data.get_datasets, "datasets")
# Extra keyword arguments (here: representation) are forwarded to the API
# call on every page via **kwargs.
all_entities = fetch_all_pages(
    client.ontology.get_entities,
    "entities",
    representation="company"
)
Paginate Entities
def fetch_all_entities(client, representation="company", max_pages=None):
    """Fetch all entities of a given type."""
    results = []
    page_number = 1
    per_page = 100
    while True:
        resp = client.ontology.get_entities(
            representation=representation,
            page=page_number,
            page_size=per_page,
        )
        batch = resp.get("entities", [])
        if not batch:
            break  # empty page: no more entities
        results.extend(batch)
        total = resp.get("total", 0)
        if total and len(results) >= total:
            break  # reached the server-reported total
        if max_pages and page_number >= max_pages:
            break  # honor the caller's page cap
        page_number += 1
    return results
# Get all companies
# Returns everything found under the "entities" key, across all pages.
companies = fetch_all_entities(client, representation="company")
Paginate Framework Data
import pandas as pd
def fetch_all_framework_data(client, framework_id, page_size=10000):
    """
    Fetch all data from a purchased framework.

    Args:
        client: CarbonArcClient instance
        framework_id: ID of the purchased framework
        page_size: Records per page (use larger for efficiency)

    Returns:
        pandas DataFrame with all data
    """
    rows = []
    page = 1
    while True:
        response = client.explorer.get_framework_data(
            framework_id=framework_id,
            page=page,
            page_size=page_size,
        )
        batch = response.get("data", [])
        if not batch:
            break  # empty page: collection complete
        rows.extend(batch)
        total = response.get("total", 0)
        # Report running progress; guard the division when total is 0/missing.
        pct = (len(rows) / total * 100) if total else 0
        print(f" Page {page}: {len(rows):,}/{total:,} ({pct:.1f}%)")
        if total and len(rows) >= total:
            break
        page += 1
    return pd.DataFrame(rows)
# Usage
df = fetch_all_framework_data(client, "fw_abc123")
# NOTE(review): to_parquet needs a parquet engine (e.g. pyarrow) installed.
df.to_parquet("framework_data.parquet")
With Progress Bar
First install the progress-bar dependency: `pip install tqdm`
from tqdm import tqdm
import math
def fetch_with_progress(fetch_func, data_key, page_size=100, **kwargs):
    """Fetch all pages with a progress bar.

    Args:
        fetch_func: API method to call
        data_key: Key in response containing the data
        page_size: Records per page
        **kwargs: Additional arguments to pass to the API

    Returns:
        List of all records
    """
    # First call doubles as a probe for the total record count.
    first_response = fetch_func(page=1, page_size=page_size, **kwargs)
    total = first_response.get("total", 0)
    first_data = first_response.get(data_key, [])
    if not first_data:
        return []
    all_data = list(first_data)
    if not total:
        # BUG FIX: the original returned after page 1 whenever "total" was
        # missing or zero, silently dropping the remaining pages. Without a
        # total we cannot size the bar, so fall back to fetching until the
        # server returns an empty page (same guard as the other helpers).
        page = 2
        while True:
            data = fetch_func(page=page, page_size=page_size, **kwargs).get(data_key, [])
            if not data:
                break
            all_data.extend(data)
            page += 1
        return all_data
    if len(all_data) >= total:
        return all_data
    total_pages = math.ceil(total / page_size)
    # initial=1 because page 1 was already fetched above.
    for page in tqdm(range(2, total_pages + 1), desc="Fetching", initial=1, total=total_pages):
        response = fetch_func(page=page, page_size=page_size, **kwargs)
        data = response.get(data_key, [])
        if not data:
            break
        all_data.extend(data)
        if len(all_data) >= total:
            break
    return all_data
# Usage
all_datasets = fetch_with_progress(client.data.get_datasets, "datasets")
Memory-Efficient Generator
For very large data assets, process records one at a time:
def paginate_generator(fetch_func, data_key, page_size=100, **kwargs):
    """
    Generator that yields records one page at a time.
    Memory efficient - doesn't load all data at once.
    """
    page_number = 1
    while True:
        resp = fetch_func(page=page_number, page_size=page_size, **kwargs)
        records = resp.get(data_key, [])
        if not records:
            return  # empty page: exhaust the generator
        yield from records
        total = resp.get("total", 0)
        # Stop once the pages fetched so far cover the reported total.
        if total and page_number * page_size >= total:
            return
        page_number += 1
# Process records one at a time
for dataset in paginate_generator(client.data.get_datasets, "datasets"):
    print(dataset.get("dataset_id"))
With Retry Logic
Handle transient failures gracefully:
import time
def fetch_all_with_retry(fetch_func, data_key, page_size=100, max_retries=3, **kwargs):
    """Fetch all pages with automatic retry on failure.

    Args:
        fetch_func: API method to call
        data_key: Key in response containing the data
        page_size: Records per page
        max_retries: Attempts per page before giving up (values < 1 act as 1)
        **kwargs: Additional arguments to pass to the API

    Returns:
        List of all records

    Raises:
        Exception: re-raises the last fetch error once retries are exhausted
    """
    all_data = []
    page = 1
    # BUG FIX: clamp to at least one attempt. The original raised
    # UnboundLocalError on `response` when max_retries <= 0 because the
    # retry loop body never ran.
    attempts = max(1, max_retries)
    while True:
        # Retry loop
        for attempt in range(attempts):
            try:
                response = fetch_func(page=page, page_size=page_size, **kwargs)
                break
            except Exception:
                if attempt >= attempts - 1:
                    raise  # out of retries: surface the original error
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Page {page} failed, retrying in {wait_time}s...")
                time.sleep(wait_time)
        data = response.get(data_key, [])
        if not data:
            break
        all_data.extend(data)
        total = response.get("total", 0)
        if total and len(all_data) >= total:
            break
        page += 1
    return all_data
Complete Paginator Class
All-in-one solution:
import pandas as pd
import time
class CarbonArcPaginator:
    """Reusable paginator for Carbon Arc API endpoints."""

    def __init__(self, client, default_page_size=100, max_retries=3, show_progress=True):
        # Configuration shared by every fetch this paginator performs.
        self.client = client
        self.default_page_size = default_page_size
        self.max_retries = max_retries
        self.show_progress = show_progress

    def fetch_all(self, fetch_func, data_key, page_size=None, max_pages=None, **kwargs):
        """Fetch all pages and return combined list."""
        size = page_size or self.default_page_size
        collected = []
        page_number = 1
        while True:
            resp = self._fetch_with_retry(fetch_func, page_number, size, **kwargs)
            batch = resp.get(data_key, [])
            if not batch:
                break  # empty page: done
            collected.extend(batch)
            total = resp.get("total", 0)
            if self.show_progress and total:
                pct = len(collected) / total * 100
                print(f" Progress: {len(collected):,}/{total:,} ({pct:.1f}%)")
            if total and len(collected) >= total:
                break
            if max_pages and page_number >= max_pages:
                break
            page_number += 1
        return collected

    def fetch_dataframe(self, fetch_func, data_key, **kwargs):
        """Fetch all pages and return as DataFrame."""
        return pd.DataFrame(self.fetch_all(fetch_func, data_key, **kwargs))

    def iterate(self, fetch_func, data_key, page_size=None, **kwargs):
        """Generator that yields records one at a time."""
        size = page_size or self.default_page_size
        page_number = 1
        while True:
            resp = self._fetch_with_retry(fetch_func, page_number, size, **kwargs)
            batch = resp.get(data_key, [])
            if not batch:
                return
            yield from batch
            total = resp.get("total", 0)
            # Stop once the pages fetched so far cover the reported total.
            if total and page_number * size >= total:
                return
            page_number += 1

    def _fetch_with_retry(self, fetch_func, page, page_size, **kwargs):
        """Fetch a single page with retry logic."""
        for attempt in range(self.max_retries):
            try:
                return fetch_func(page=page, page_size=page_size, **kwargs)
            except Exception:
                if attempt >= self.max_retries - 1:
                    raise  # out of retries: surface the original error
                wait = 2 ** attempt
                print(f"Retry {attempt + 1}/{self.max_retries} in {wait}s...")
                time.sleep(wait)
# Usage
paginator = CarbonArcPaginator(client)

# Get all datasets as list
datasets = paginator.fetch_all(client.data.get_datasets, data_key="datasets")

# Get all companies as DataFrame
df = paginator.fetch_dataframe(
    client.ontology.get_entities,
    data_key="entities",
    representation="company",
)

# Iterate memory-efficiently
for entity in paginator.iterate(client.ontology.get_entities, "entities"):
    process(entity)
Page Size Recommendations
| Use Case | Recommended Page Size |
|---|---|
| Quick preview / testing | 10-50 |
| General data fetching | 100 (default) |
| Block data collection | 500-1000 |
| Framework data export | 5000-10000 |
| Memory-constrained | 50-100 |
Trade-offs
Larger page size:
- Pro: fewer API calls = faster overall
- Pro: less network overhead
- Con: more memory per request
- Con: longer wait if a request fails
Smaller page size:
- Pro: lower memory usage
- Pro: faster feedback / progress
- Pro: less data lost on failure
- Con: more API calls = slower overall
Quick Reference
Basic Pattern
all_data = []
page = 1
while True:
    response = api_call(page=page, page_size=100)
    data = response.get("data_key", [])
    if not data:
        break  # empty page: done
    all_data.extend(data)
    # NOTE(review): assumes the response carries "total"; a missing total
    # (default 0) stops after the first page.
    if len(all_data) >= response.get("total", 0):
        break
    page += 1
Response Structure
{
"data": [...], // or "datasets", "entities", etc.
"total": 1500, // Total available records
"page": 1, // Current page
"page_size": 100 // Records per page
}
Helper Functions Summary
| Function | Description |
|---|---|
| `fetch_all_pages()` | Generic paginator |
| `fetch_all_entities()` | Paginate ontology entities |
| `fetch_all_framework_data()` | Paginate framework data |
| `fetch_with_progress()` | With tqdm progress bar |
| `fetch_all_with_retry()` | With automatic retry |
| `paginate_generator()` | Memory-efficient generator |
| `CarbonArcPaginator` | All-in-one class |
Next Steps
- Learn about Filters & Date Ranges for precise queries
- Explore Exporting Data to save your paginated results
- Implement robust Error Handling for production workflows