Skip to main content

Exporting Data

Export your Carbon Arc data to CSV, Excel, JSON, Parquet, and databases.

Setup

import pandas as pd
from carbonarc import CarbonArcClient
import os

# Connect to the Carbon Arc API (replace the token with your real API token)
client = CarbonArcClient(
host="https://api.carbonarc.co",
token="YOUR_API_TOKEN"
)

# Get some data to export
datasets = client.data.get_datasets()
# NOTE(review): assumes the response dict carries its rows under "datasources" — verify against the API
df = pd.DataFrame(datasets.get("datasources", []))

Export Formats

CSV

Best for: Excel compatibility, simple data sharing, universal format.

# Plain export; drop the DataFrame index column
df.to_csv("data_export.csv", index=False)

# Export with explicit formatting options
csv_options = {
    "index": False,
    "encoding": "utf-8-sig",    # BOM-prefixed UTF-8 so Excel auto-detects the encoding
    "date_format": "%Y-%m-%d",  # uniform date rendering
    "float_format": "%.2f",     # two decimal places
    "na_rep": "NULL",           # write NaN cells as the literal text NULL
}
df.to_csv("data_export_custom.csv", **csv_options)

# Gzip-compressed output for large extracts
df.to_csv("data_export.csv.gz", index=False, compression="gzip")

Excel

Best for: Business users, formatted reports, multiple sheets.

pip install openpyxl
# Basic export
df.to_excel("data_export.xlsx", index=False, sheet_name="Data")

# Multiple sheets in one workbook
with pd.ExcelWriter("multi_sheet_export.xlsx", engine="openpyxl") as writer:
    df.to_excel(writer, sheet_name="All Data", index=False)
    df.head(5).to_excel(writer, sheet_name="Sample", index=False)

    # Summary sheet — must be written while the writer is still open;
    # once the `with` block exits the workbook is saved and closed,
    # and further to_excel(writer, ...) calls fail.
    summary = pd.DataFrame({
        "Metric": ["Total Rows", "Total Columns", "Export Date"],
        "Value": [len(df), len(df.columns), pd.Timestamp.now().strftime("%Y-%m-%d")]
    })
    summary.to_excel(writer, sheet_name="Summary", index=False)

JSON

Best for: APIs, web apps, nested data, programmatic access.

import json  # stdlib json, useful for post-processing the exported files

# "records" orientation: a list of row dicts — the most interchange-friendly layout
df.to_json("data_export.json", orient="records", indent=2)

# Alternative orientations
df.to_json("data_columns.json", orient="columns", indent=2)  # keyed by column name
df.to_json("data_index.json", orient="index", indent=2)      # keyed by row index

# JSON Lines: one record per line, well suited to streaming consumers
df.to_json("data_export.jsonl", orient="records", lines=True)

Parquet

Best for: Large data assets, data pipelines, analytics, cloud storage.

pip install pyarrow
# Default export: compact columnar output that keeps dtypes intact
df.to_parquet("data_export.parquet", index=False)

# Pick a codec to trade speed against file size
df.to_parquet("data_snappy.parquet", compression="snappy")  # faster
df.to_parquet("data_gzip.parquet", compression="gzip")      # smaller

# Split the output into per-value directories for very large datasets
df.to_parquet("data_partitioned/", partition_cols=["column_name"])
Parquet Benefits
  • 10-100x smaller than CSV
  • Preserves data types (dates, numbers, etc.)
  • Columnar format = fast queries
  • Native support in Spark, Athena, BigQuery

Database Exports

SQLite (Local)

Best for: Local analysis, prototyping, SQL queries.

import sqlite3

# Open (or create) the local database file
conn = sqlite3.connect("carbonarc_data.db")

# Load the frame into the "datasets" table, dropping any previous contents
df.to_sql(
    "datasets",
    conn,
    if_exists="replace",  # alternatives: "fail", "append"
    index=False,
)

# Read the row count back to confirm the load succeeded
result = pd.read_sql("SELECT COUNT(*) as count FROM datasets", conn)
print(f"Rows: {result['count'].iloc[0]}")

conn.close()

PostgreSQL

pip install sqlalchemy psycopg2-binary
from sqlalchemy import create_engine

def export_to_postgres(df, table_name, connection_string):
    """Write df to a PostgreSQL table, replacing the table if it exists.

    connection_string: postgresql://user:password@host:port/database
    """
    engine = create_engine(connection_string)
    df.to_sql(table_name, engine, if_exists="replace", index=False)
    print(f"Exported {len(df)} rows to {table_name}")

# Usage
export_to_postgres(
    df,
    "carbonarc_data",
    "postgresql://user:password@localhost:5432/mydb"
)

MySQL

pip install sqlalchemy pymysql
from sqlalchemy import create_engine

def export_to_mysql(df, table_name, connection_string):
    """Write df to a MySQL table, replacing the table if it exists.

    connection_string: mysql+pymysql://user:password@host:port/database
    """
    engine = create_engine(connection_string)
    df.to_sql(table_name, engine, if_exists="replace", index=False)
    print(f"Exported {len(df)} rows to {table_name}")

# Usage
export_to_mysql(
    df,
    "carbonarc_data",
    "mysql+pymysql://user:password@localhost:3306/mydb"
)

Cloud Storage

AWS S3

pip install s3fs boto3
def export_to_s3(df, bucket, key, file_format="parquet"):
    """Export a DataFrame to AWS S3 (requires s3fs).

    Args:
        df: DataFrame to upload.
        bucket: Target S3 bucket name.
        key: Object key (path) within the bucket.
        file_format: "parquet" (default) or "csv".

    Raises:
        ValueError: if file_format is not a supported format.
    """
    # Fail fast on a bad format — previously an unknown format silently
    # skipped the upload yet still printed the success message.
    if file_format not in ("parquet", "csv"):
        raise ValueError(f"Unsupported file_format: {file_format!r}")

    s3_path = f"s3://{bucket}/{key}"

    if file_format == "parquet":
        df.to_parquet(s3_path, index=False)
    else:
        df.to_csv(s3_path, index=False)

    print(f"Exported to {s3_path}")

# Usage: upload df as parquet to s3://my-bucket/data/carbonarc/export.parquet
export_to_s3(df, "my-bucket", "data/carbonarc/export.parquet")

Google Cloud Storage

pip install gcsfs
def export_to_gcs(df, bucket, key, file_format="parquet"):
    """Export a DataFrame to Google Cloud Storage (requires gcsfs).

    Args:
        df: DataFrame to upload.
        bucket: Target GCS bucket name.
        key: Object key (path) within the bucket.
        file_format: "parquet" (default) or "csv".

    Raises:
        ValueError: if file_format is not a supported format.
    """
    # Fail fast on a bad format — previously an unknown format silently
    # skipped the upload yet still printed the success message.
    if file_format not in ("parquet", "csv"):
        raise ValueError(f"Unsupported file_format: {file_format!r}")

    gcs_path = f"gs://{bucket}/{key}"

    if file_format == "parquet":
        df.to_parquet(gcs_path, index=False)
    else:
        df.to_csv(gcs_path, index=False)

    print(f"Exported to {gcs_path}")

# Usage: upload df as parquet to gs://my-bucket/data/carbonarc/export.parquet
export_to_gcs(df, "my-bucket", "data/carbonarc/export.parquet")

Azure Blob Storage

pip install adlfs
def export_to_azure(df, container, blob_path, file_format="parquet"):
    """Export a DataFrame to Azure Blob Storage (requires adlfs).

    Args:
        df: DataFrame to upload.
        container: Target blob container name.
        blob_path: Blob path within the container.
        file_format: "parquet" (default) or "csv".

    Raises:
        ValueError: if file_format is not a supported format.
    """
    # Fail fast on a bad format — previously an unknown format silently
    # skipped the upload yet still printed the success message.
    if file_format not in ("parquet", "csv"):
        raise ValueError(f"Unsupported file_format: {file_format!r}")

    azure_path = f"az://{container}/{blob_path}"

    if file_format == "parquet":
        df.to_parquet(azure_path, index=False)
    else:
        df.to_csv(azure_path, index=False)

    print(f"Exported to {azure_path}")

# Usage: upload df as parquet to az://my-container/data/carbonarc/export.parquet
export_to_azure(df, "my-container", "data/carbonarc/export.parquet")

Framework Data Export

Export Single Framework

def export_framework_data(client, framework_id, output_path, file_format="csv"):
    """
    Export purchased framework data to a file.

    Pages through the framework data API, assembles a single DataFrame,
    and writes it out in the requested format.

    Args:
        client: CarbonArcClient instance
        framework_id: ID of the purchased framework
        output_path: Output file path (without extension)
        file_format: "csv", "excel", "parquet", or "json"

    Returns:
        pandas DataFrame containing all fetched rows

    Raises:
        ValueError: if file_format is not one of the supported formats.
    """
    extensions = {"csv": ".csv", "excel": ".xlsx", "parquet": ".parquet", "json": ".json"}
    # Validate the format BEFORE fetching — previously a typo only surfaced
    # as a KeyError after the (potentially long) paginated download finished.
    if file_format not in extensions:
        raise ValueError(
            f"Unsupported file_format: {file_format!r}; expected one of {sorted(extensions)}"
        )

    # Get framework data (handles pagination)
    all_data = []
    page = 1
    while True:
        response = client.explorer.get_framework_data(
            framework_id=framework_id,
            page=page,
            page_size=10000,
        )
        data = response.get("data", [])
        if not data:
            break
        all_data.extend(data)
        # Stop once the server-reported total has been collected.
        if len(all_data) >= response.get("total", 0):
            break
        page += 1

    df = pd.DataFrame(all_data)

    # Export based on format
    path = f"{output_path}{extensions[file_format]}"
    if file_format == "csv":
        df.to_csv(path, index=False)
    elif file_format == "excel":
        df.to_excel(path, index=False)
    elif file_format == "parquet":
        df.to_parquet(path, index=False)
    else:  # json — the only remaining validated format
        df.to_json(path, orient="records", indent=2)

    print(f"Exported {len(df)} rows to {path}")
    return df

# Usage: writes exports/tesla_revenue.parquet and returns the DataFrame
df = export_framework_data(
client,
framework_id="fw_abc123",
output_path="exports/tesla_revenue",
file_format="parquet"
)

Batch Export (Multiple Frameworks)

def batch_export_frameworks(client, framework_ids, output_dir="exports", file_format="parquet"):
    """Export each framework in framework_ids to its own file under output_dir.

    Returns a dict mapping framework id -> DataFrame (None for failures).
    """
    import os
    os.makedirs(output_dir, exist_ok=True)

    results = {}
    for fw_id in framework_ids:
        try:
            # Derive a filesystem-friendly file name from the framework metadata.
            metadata = client.explorer.get_framework_metadata(framework_id=fw_id)
            name = metadata.get("name", fw_id).replace(" ", "_").lower()
            destination = os.path.join(output_dir, name)
            results[fw_id] = export_framework_data(client, fw_id, destination, file_format)
        except Exception as e:
            # Best-effort batch: record the failure and keep going.
            print(f"Failed to export {fw_id}: {e}")
            results[fw_id] = None

    success_count = len([r for r in results.values() if r is not None])
    print(f"\nBatch export: {success_count}/{len(framework_ids)} successful")
    return results

# Usage
results = batch_export_frameworks(
    client,
    framework_ids=["fw_123", "fw_456", "fw_789"],
    output_dir="exports/batch_2024",
    file_format="parquet"
)

Scheduled Exports

from datetime import datetime

def scheduled_export(client, framework_id, output_dir="exports"):
    """
    Export framework data to a timestamped parquet file for scheduled runs.

    Creates e.g.: exports/framework_2024-01-15_143022.parquet

    Args:
        client: CarbonArcClient instance
        framework_id: ID of the framework to export
        output_dir: Directory that receives the timestamped file

    Returns:
        Path of the parquet file that was written.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")

    # Get all data with pagination
    all_data = []
    page = 1
    while True:
        response = client.explorer.get_framework_data(
            framework_id=framework_id,
            page=page,
            page_size=10000
        )
        data = response.get("data", [])
        if not data:
            break
        all_data.extend(data)
        # Stop once the server-reported total has been collected.
        if len(all_data) >= response.get("total", 0):
            break
        page += 1

    df = pd.DataFrame(all_data)

    # Export with timestamp
    os.makedirs(output_dir, exist_ok=True)
    filename = f"{output_dir}/framework_{timestamp}.parquet"
    df.to_parquet(filename, index=False)

    # BUG FIX: the message previously printed the literal placeholder
    # "(unknown)" instead of the output path.
    print(f"Exported: {filename} ({len(df)} rows)")
    return filename

Cron example:

# Run daily at 6 AM
0 6 * * * python -c "from export_script import scheduled_export; scheduled_export(...)"

Quick Reference

Format Comparison

| Format  | Size   | Speed  | Best For                 |
| ------- | ------ | ------ | ------------------------ |
| CSV     | Large  | Fast   | Excel, universal sharing |
| Excel   | Large  | Medium | Business users, reports  |
| JSON    | Large  | Fast   | APIs, web apps           |
| Parquet | Small  | Fast   | Analytics, data lakes    |
| SQLite  | Medium | Medium | Local SQL queries        |

Quick Commands

# CSV (universal, but dtypes are not preserved)
df.to_csv("data.csv", index=False)

# Excel (requires openpyxl)
df.to_excel("data.xlsx", index=False)

# JSON (list-of-records layout)
df.to_json("data.json", orient="records", indent=2)

# Parquet (preserves dtypes; requires pyarrow)
df.to_parquet("data.parquet", index=False)

# SQLite (conn is an open sqlite3 connection)
df.to_sql("table", conn, if_exists="replace", index=False)

Compression

# CSV with gzip compression
df.to_csv("data.csv.gz", compression="gzip")

# Parquet with snappy compression (fast)
df.to_parquet("data.parquet", compression="snappy")

Required Packages

# Core
pip install pandas openpyxl pyarrow

# Databases
pip install sqlalchemy psycopg2-binary # PostgreSQL
pip install sqlalchemy pymysql # MySQL

# Cloud Storage
pip install s3fs boto3 # AWS S3
pip install gcsfs # Google Cloud Storage
pip install adlfs # Azure Blob Storage

Next Steps