
PandRS

A DataFrame library for data analysis implemented in Rust. Its features and design are inspired by Python's pandas library, and it combines fast data processing with type safety.

Key Features

  • Efficient data processing with high-performance column-oriented storage
  • Low memory footprint with categorical data and string pool optimization
  • Multi-core utilization through parallel processing
  • Optimization with lazy evaluation system
  • Thread-safe implementation
  • Robustness leveraging Rust's type safety and ownership system
  • Modularized design (implementation divided by functionality)
  • Python integration (PyO3 bindings)

Features

  • Series (1-dimensional array) and DataFrame (2-dimensional table) data structures
  • Support for missing values (NA)
  • Grouping and aggregation operations
  • Row labels with indexes
  • Multi-level indexes (hierarchical indexes)
  • CSV/JSON reading and writing
  • Parquet data format support
  • Basic operations (filtering, sorting, joining, etc.)
  • Aggregation functions for numeric data
  • Special operations for string data
  • Basic time series data processing
  • Categorical data types (efficient memory use, ordered categories)
  • Pivot tables
  • Visualization via text-based plots and high-quality graph output
  • Parallel processing support
  • Statistical analysis functions (descriptive statistics, t-tests, regression analysis, etc.)
  • Machine learning evaluation metrics (MSE, R², accuracy, F1, etc.)
  • Optimized implementation (column-oriented storage, lazy evaluation, string pool)
  • High-performance implementation split into sub-modules by functionality

Usage Examples

Creating and Basic Operations with DataFrames

use pandrs::{DataFrame, Series};

// The examples below use `?`, so they assume an error-propagating context
// (for example, a function returning Result<(), Box<dyn std::error::Error>>)

// Create series
let ages = Series::new(vec![30, 25, 40], Some("age".to_string()))?;
let heights = Series::new(vec![180, 175, 182], Some("height".to_string()))?;

// Add series to DataFrame
let mut df = DataFrame::new();
df.add_column("age".to_string(), ages)?;
df.add_column("height".to_string(), heights)?;

// Save as CSV
df.to_csv("data.csv")?;

// Load DataFrame from CSV
let df_from_csv = DataFrame::from_csv("data.csv", true)?;

Numeric Operations

// Create numeric series
let numbers = Series::new(vec![10, 20, 30, 40, 50], Some("values".to_string()))?;

// Statistical calculations
let sum = numbers.sum();         // 150
let mean = numbers.mean()?;      // 30
let min = numbers.min()?;        // 10
let max = numbers.max()?;        // 50

Installation

Add the following to your Cargo.toml:

[dependencies]
pandrs = "0.1.0-alpha.1"

Working with Missing Values (NA)

use pandrs::{NA, NASeries};

// Create series with NA values
let data = vec![
    NA::Value(10), 
    NA::Value(20), 
    NA::NA,  // missing value
    NA::Value(40)
];
let series = NASeries::new(data, Some("values".to_string()))?;

// Handle NA values
println!("Number of NAs: {}", series.na_count());
println!("Number of values: {}", series.value_count());

// Drop and fill NA values
let dropped = series.dropna()?;
let filled = series.fillna(0)?;

Group Operations

use pandrs::{GroupBy, Series};

// Data and group keys
let values = Series::new(vec![10, 20, 15, 30, 25], Some("values".to_string()))?;
let keys = vec!["A", "B", "A", "C", "B"];

// Group and aggregate
let group_by = GroupBy::new(
    keys.iter().map(|s| s.to_string()).collect(),
    &values,
    Some("by_category".to_string())
)?;

// Aggregation results
let sums = group_by.sum()?;
let means = group_by.mean()?;

Time Series Operations

use pandrs::temporal::{TimeSeries, date_range, Frequency};
use pandrs::Series;
use chrono::NaiveDate;
use std::str::FromStr; // brings NaiveDate::from_str into scope

// Generate date range
let dates = date_range(
    NaiveDate::from_str("2023-01-01")?,
    NaiveDate::from_str("2023-01-31")?,
    Frequency::Daily,
    true
)?;

// Create time series data (one value per generated date; any Series of
// matching length will do)
let values = Series::new((1..=31).map(f64::from).collect::<Vec<f64>>(), Some("daily_data".to_string()))?;
let time_series = TimeSeries::new(values, dates, Some("daily_data".to_string()))?;

// Time filtering
let filtered = time_series.filter_by_time(
    &NaiveDate::from_str("2023-01-10")?,
    &NaiveDate::from_str("2023-01-20")?
)?;

// Calculate moving average
let moving_avg = time_series.rolling_mean(3)?;

// Resampling (convert to weekly)
let weekly = time_series.resample(Frequency::Weekly).mean()?;
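
For intuition, rolling_mean(3) averages each window of three consecutive observations. A plain-Rust sketch of the same computation (what pandrs emits for the first window-1 positions is not specified in this README):

let data = vec![10.0, 20.0, 15.0, 30.0, 25.0];
let rolling: Vec<f64> = data
    .windows(3)
    .map(|w| w.iter().sum::<f64>() / w.len() as f64)
    .collect(); // approx. [15.0, 21.67, 23.33]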

Statistical Analysis and Machine Learning Evaluation Functions

use pandrs::{DataFrame, Series, stats};
use pandrs::ml::metrics::regression::{mean_squared_error, r2_score};
use pandrs::ml::metrics::classification::{accuracy_score, f1_score};

// Descriptive statistics
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let stats_summary = stats::describe(&data)?;
println!("Mean: {}, Standard deviation: {}", stats_summary.mean, stats_summary.std);
println!("Median: {}, Quartiles: {} - {}", stats_summary.median, stats_summary.q1, stats_summary.q3);

// Calculate correlation coefficient
let x = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let y = vec![2.0, 3.0, 4.0, 5.0, 6.0];
let correlation = stats::correlation(&x, &y)?;
println!("Correlation coefficient: {}", correlation);

// Run t-test
let sample1 = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let sample2 = vec![2.0, 3.0, 4.0, 5.0, 6.0];
let alpha = 0.05; // significance level
let result = stats::ttest(&sample1, &sample2, alpha, true)?;
println!("t-statistic: {}, p-value: {}", result.statistic, result.pvalue);
println!("Significant difference: {}", result.significant);

// Regression analysis
let mut df = DataFrame::new();
df.add_column("x1".to_string(), Series::new(vec![1.0, 2.0, 3.0, 4.0, 5.0], Some("x1".to_string()))?)?;
df.add_column("x2".to_string(), Series::new(vec![2.0, 3.0, 4.0, 5.0, 6.0], Some("x2".to_string()))?)?;
df.add_column("y".to_string(), Series::new(vec![3.0, 5.0, 7.0, 9.0, 11.0], Some("y".to_string()))?)?;

let model = stats::linear_regression(&df, "y", &["x1", "x2"])?;
println!("Coefficients: {:?}", model.coefficients());
println!("Coefficient of determination: {}", model.r_squared());

// Machine learning model evaluation - regression metrics
let y_true = vec![3.0, 5.0, 2.5, 7.0, 10.0];
let y_pred = vec![2.8, 4.8, 2.7, 7.2, 9.8];

let mse = mean_squared_error(&y_true, &y_pred)?;
let r2 = r2_score(&y_true, &y_pred)?;
println!("MSE: {:.4}, R²: {:.4}", mse, r2);

// Machine learning model evaluation - classification metrics
let true_labels = vec![true, false, true, true, false, false];
let pred_labels = vec![true, false, false, true, true, false];

let accuracy = accuracy_score(&true_labels, &pred_labels)?;
let f1 = f1_score(&true_labels, &pred_labels)?;
println!("Accuracy: {:.2}, F1 Score: {:.2}", accuracy, f1);

Pivot Tables and Grouping

use pandrs::pivot::AggFunction;

// Grouping and aggregation
let grouped = df.groupby("category")?;
let category_sum = grouped.sum(&["sales"])?;

// Pivot table
let pivot_result = df.pivot_table(
    "category",   // index column
    "region",     // column column
    "sales",      // value column
    AggFunction::Sum
)?;

Development Plan and Implementation Status

  • Basic DataFrame structure
  • Series implementation
  • Index functionality
  • CSV input/output
  • JSON input/output
  • Parquet format support
  • Missing value handling
  • Grouping operations
  • Time series data support
    • Date range generation
    • Time filtering
    • Moving average calculation
    • Frequency conversion (resampling)
  • Pivot tables
  • Complete implementation of join operations (see the sketch after this list)
    • Inner join (matching keys only)
    • Left join (all left rows kept)
    • Right join (all right rows kept)
    • Outer join (all rows from both sides)
  • Visualization functionality integration
    • Line graphs
    • Scatter plots
    • Text plot output
  • Parallel processing support
    • Parallel conversion of Series/NASeries
    • Parallel processing of DataFrames
    • Parallel filtering (1.15x speedup)
    • Parallel aggregation (3.91x speedup)
    • Parallel computation processing (1.37x speedup)
    • Adaptive parallel processing (automatic selection based on data size)
  • Enhanced visualization
    • Text-based plots with textplots (line, scatter)
    • High-quality graph output with plotters (PNG, SVG format)
    • Various graph types (line, scatter, bar, histogram, area)
    • Graph customization options (size, color, grid, legend)
    • Intuitive plot API for Series, DataFrame
  • Multi-level indexes
    • Hierarchical index structure
    • Data grouping by multiple levels
    • Level operations (swap, select)
  • Categorical data types
    • Memory-efficient encoding
    • Support for ordered and unordered categories
    • Complete integration with NA values (missing values)
  • Advanced DataFrame operations
    • Long-form and wide-form conversion (melt, stack, unstack)
    • Conditional aggregation
    • DataFrame concatenation
  • Memory usage optimization
    • String pool optimization (up to 89.8% memory reduction)
    • Categorical encoding (2.59x performance improvement)
    • Global string pool implementation
    • Improved memory locality with column-oriented storage
  • Python bindings
    • Python module with PyO3
    • Interoperability with numpy and pandas
    • Jupyter Notebook support
    • Speedup with string pool optimization (up to 3.33x)
  • Lazy evaluation system
    • Operation optimization with computation graph
    • Operation fusion
    • Avoiding unnecessary intermediate results
  • Statistical analysis features
    • Descriptive statistics (mean, standard deviation, quantiles, etc.)
    • Correlation coefficient and covariance
    • Hypothesis testing (t-test)
    • Regression analysis (simple and multiple regression)
    • Sampling methods (bootstrap, etc.)
  • Machine learning evaluation metrics
    • Regression evaluation (MSE, MAE, RMSE, R² score)
    • Classification evaluation (accuracy, precision, recall, F1 score)
  • Codebase maintainability improvements
    • File separation of OptimizedDataFrame by functionality
    • API compatibility maintained through re-exports
    • Independent implementation of ML metrics module
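
As referenced in the join item above, this README does not include a pandrs join code sample, so the following sketch illustrates inner- and left-join semantics generically with std collections (not the pandrs API):

use std::collections::HashMap;

// Illustrative only: inner vs. left join over (key, value) rows
let left = vec![("A", 1), ("B", 2), ("C", 3)];
let right = vec![("A", 10.0), ("B", 20.0), ("D", 40.0)];
let right_map: HashMap<&str, f64> = right.into_iter().collect();

// Inner join: keep only keys present on both sides -> A, B
let inner: Vec<(&str, i32, f64)> = left.iter()
    .filter_map(|(k, v)| right_map.get(k).map(|r| (*k, *v, *r)))
    .collect();

// Left join: keep every left row, None where the right side is missing -> A, B, C
let left_join: Vec<(&str, i32, Option<f64>)> = left.iter()
    .map(|(k, v)| (*k, *v, right_map.get(k).copied()))
    .collect();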

Multi-level Index Operations

use pandrs::{DataFrame, MultiIndex};

// Create MultiIndex from tuples
let tuples = vec![
    vec!["A".to_string(), "a".to_string()],
    vec!["A".to_string(), "b".to_string()],
    vec!["B".to_string(), "a".to_string()],
    vec!["B".to_string(), "b".to_string()],
];

// Set level names
let names = Some(vec![Some("first".to_string()), Some("second".to_string())]);
let multi_idx = MultiIndex::from_tuples(tuples, names)?;

// Create DataFrame with MultiIndex (cloned so `multi_idx` stays usable for
// the level operations below; this assumes MultiIndex implements Clone)
let mut df = DataFrame::with_multi_index(multi_idx.clone());

// Add data
let data = vec!["data1".to_string(), "data2".to_string(), "data3".to_string(), "data4".to_string()];
df.add_column("data".to_string(), pandrs::Series::new(data, Some("data".to_string()))?)?;

// Level operations
let level0_values = multi_idx.get_level_values(0)?;
let level1_values = multi_idx.get_level_values(1)?;

// Swap levels
let swapped_idx = multi_idx.swaplevel(0, 1)?;

Python Binding Usage Examples

import pandrs as pr
import numpy as np
import pandas as pd

# Create optimized DataFrame
df = pr.OptimizedDataFrame()
df.add_int_column('A', [1, 2, 3, 4, 5])
df.add_string_column('B', ['a', 'b', 'c', 'd', 'e'])
df.add_float_column('C', [1.1, 2.2, 3.3, 4.4, 5.5])

# Traditional API compatible interface
df2 = pr.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, 2.2, 3.3, 4.4, 5.5]
})

# Interoperability with pandas
pd_df = df.to_pandas()  # Convert from PandRS to pandas DataFrame
pr_df = pr.OptimizedDataFrame.from_pandas(pd_df)  # Convert from pandas DataFrame to PandRS

# Using lazy evaluation
lazy_df = pr.LazyFrame(df)
result = lazy_df.filter('A').select(['B', 'C']).execute()

# Direct use of string pool
string_pool = pr.StringPool()
idx1 = string_pool.add("repeated_value")
idx2 = string_pool.add("repeated_value")  # Returns the same index
print(string_pool.get(idx1))  # Returns "repeated_value"

# CSV input/output
df.to_csv('data.csv')
df_loaded = pr.OptimizedDataFrame.read_csv('data.csv')

# NumPy integration
series = df['A']
np_array = series.to_numpy()

# Jupyter Notebook support
from pandrs.jupyter import display_dataframe
display_dataframe(df, max_rows=10, max_cols=5)

Performance Optimization Results

The optimized column-oriented storage engine and the lazy evaluation system have delivered significant performance improvements:

Performance Comparison of Key Operations

| Operation | Traditional Implementation | Optimized Implementation | Speedup |
|-----------|----------------------------|--------------------------|---------|
| Series/Column Creation | 198.446ms | 149.528ms | 1.33x |
| DataFrame Creation (1 million rows) | NA | NA | NA |
| Filtering | 596.146ms | 161.816ms | 3.68x |
| Group Aggregation | 544.384ms | 107.837ms | 5.05x |

String Processing Optimization

| Mode | Processing Time | vs Traditional | Notes |
|------|-----------------|----------------|-------|
| Legacy Mode | 596.50ms | 1.00x | Traditional implementation |
| Categorical Mode | 230.11ms | 2.59x | Categorical optimization |
| Optimized Implementation | 232.38ms | 2.57x | Optimizer selection |

Parallel Processing Performance Improvements

| Operation | Serial Processing | Parallel Processing | Speedup |
|-----------|-------------------|---------------------|---------|
| Group Aggregation | 696.85ms | 178.09ms | 3.91x |
| Filtering | 201.35ms | 175.48ms | 1.15x |
| Computation | 15.41ms | 11.23ms | 1.37x |
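
The adaptive parallel processing mentioned elsewhere in this README selects serial or parallel execution based on input size. A minimal sketch of that pattern with Rayon (the threshold here is illustrative, not pandrs's actual cutoff):

use rayon::prelude::*;

const PARALLEL_THRESHOLD: usize = 10_000; // illustrative cutoff

fn adaptive_sum(data: &[i64]) -> i64 {
    if data.len() < PARALLEL_THRESHOLD {
        // Small inputs: thread coordination would cost more than it saves
        data.iter().sum()
    } else {
        // Large inputs: let Rayon split the slice across cores
        data.par_iter().sum()
    }
}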

Python Bindings String Optimization

| Data Size | Unique Rate | Without Pool | With Pool | Processing Speedup | Memory Reduction |
|-----------|-------------|--------------|-----------|--------------------|------------------|
| 100,000 rows | 1% (high duplication) | 82ms | 35ms | 2.34x | 88.6% |
| 1,000,000 rows | 1% (high duplication) | 845ms | 254ms | 3.33x | 89.8% |
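
The string pool behind these numbers follows the classic string-interning pattern: each distinct string is stored once and rows hold small indices, which is why low unique rates yield large savings. A minimal sketch of the idea (not the pandrs internals, which add thread safety and categorical integration on top):

use std::collections::HashMap;

struct StringPool {
    map: HashMap<String, usize>,
    values: Vec<String>,
}

impl StringPool {
    fn new() -> Self {
        StringPool { map: HashMap::new(), values: Vec::new() }
    }

    // Duplicates return the existing index, so "repeated_value" is
    // stored only once no matter how many rows contain it
    fn add(&mut self, s: &str) -> usize {
        if let Some(&idx) = self.map.get(s) {
            return idx;
        }
        let idx = self.values.len();
        self.map.insert(s.to_string(), idx);
        self.values.push(s.to_string());
        idx
    }

    fn get(&self, idx: usize) -> Option<&str> {
        self.values.get(idx).map(String::as_str)
    }
}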

Recent Improvements

  • Column-Oriented Storage Engine (see the sketch after this list)

    • Type-specialized column implementation (Int64Column, Float64Column, StringColumn, BooleanColumn)
    • Improved cache efficiency through memory locality
    • Operation acceleration and parallel processing efficiency
  • String Processing Optimization

    • Elimination of duplicate strings with global string pool
    • String to index conversion with categorical encoding
    • Consistent API design and multiple optimization modes
  • Lazy Evaluation System Implementation

    • Operation pipelining with computation graph
    • Avoiding unnecessary intermediate results
    • Improved efficiency through operation fusion
  • Significant Parallel Processing Improvements

    • Efficient multi-threading with Rayon
    • Adaptive parallel processing (automatic selection based on data size)
    • Chunk processing optimization
  • Enhanced Python Integration

    • Efficient data conversion between Python and Rust with string pool optimization
    • Utilization of NumPy buffer protocol
    • Near zero-copy data access
    • Type-specialized Python API
  • Advanced DataFrame Operations

    • Complete implementation of long-form and wide-form conversion (melt, stack, unstack)
    • Enhanced conditional aggregation processing
    • Optimization of complex join operations
  • Enhanced Time Series Data Processing

    • Support for RFC3339 format date parsing
    • Complete implementation of advanced window operations
    • Support for complete format frequency specification (DAILY, WEEKLY, etc.)
  • Stability and Quality Improvements

    • Implementation of comprehensive test suite
    • Improved error handling and warning elimination
    • Enhanced documentation
    • Updated dependencies for recent Rust toolchains
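
To make the column-oriented layout concrete, here is a struct-of-arrays sketch of the type-specialized columns named above (the definitions are illustrative, not pandrs's actual Int64Column/Float64Column types):

// Each column owns one contiguous, type-specialized buffer,
// so scans stay cache-friendly
enum Column {
    Int64(Vec<i64>),
    Float64(Vec<f64>),
    String(Vec<String>),
    Boolean(Vec<bool>),
}

// Summing an integer column walks a single contiguous Vec<i64>
// instead of hopping across heterogeneous row objects
fn sum_int64(col: &Column) -> Option<i64> {
    match col {
        Column::Int64(values) => Some(values.iter().sum()),
        _ => None,
    }
}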

High-Quality Visualization (Plotters Integration)

use pandrs::{DataFrame, Series};
use pandrs::vis::plotters_ext::{PlotSettings, PlotKind, OutputType};

// Create plot from a single Series
let values = vec![15.0, 23.5, 18.2, 29.8, 32.1, 28.5, 19.2];
let series = Series::new(values, Some("temperature_change".to_string()))?;

// Create line graph
let line_settings = PlotSettings {
    title: "Temperature Change Over Time".to_string(),
    x_label: "Time".to_string(),
    y_label: "Temperature (°C)".to_string(),
    plot_kind: PlotKind::Line,
    ..PlotSettings::default()
};
series.plotters_plot("temp_line.png", line_settings)?;

// Create histogram
let hist_settings = PlotSettings {
    title: "Histogram of Temperature Distribution".to_string(),
    plot_kind: PlotKind::Histogram,
    ..PlotSettings::default()
};
series.plotters_histogram("histogram.png", 5, hist_settings)?;

// Visualization using DataFrame
let mut df = DataFrame::new();
df.add_column("temperature".to_string(), series)?;
df.add_column("humidity".to_string(), 
    Series::new(vec![67.0, 72.3, 69.5, 58.2, 62.1, 71.5, 55.8], Some("humidity".to_string()))?)?;

// Scatter plot (relationship between temperature and humidity)
let xy_settings = PlotSettings {
    title: "Relationship Between Temperature and Humidity".to_string(),
    plot_kind: PlotKind::Scatter,
    output_type: OutputType::SVG,  // Output in SVG format
    ..PlotSettings::default()
};
df.plotters_xy("temperature", "humidity", "temp_humidity.svg", xy_settings)?;

// Multiple series line graph
let multi_settings = PlotSettings {
    title: "Weather Data Trends".to_string(),
    plot_kind: PlotKind::Line,
    ..PlotSettings::default()
};
df.plotters_multi(&["temperature", "humidity"], "multi_series.png", multi_settings)?;

Dependency Versions

Latest dependency versions (April 2025):

[dependencies]
num-traits = "0.2.19"        # Numeric trait support
thiserror = "2.0.12"         # Error handling
serde = { version = "1.0.219", features = ["derive"] }  # Serialization
serde_json = "1.0.114"       # JSON processing
chrono = "0.4.40"            # Date and time processing
regex = "1.10.2"             # Regular expressions
csv = "1.3.1"                # CSV processing
rayon = "1.9.0"              # Parallel processing
lazy_static = "1.5.0"        # Lazy initialization
rand = "0.9.0"               # Random number generation
tempfile = "3.8.1"           # Temporary files
textplots = "0.8.7"          # Text-based visualization
plotters = "0.3.7"           # High-quality visualization
chrono-tz = "0.10.3"         # Timezone processing
parquet = "54.3.1"           # Parquet file support
arrow = "54.3.1"             # Arrow format support

License

Available under the Apache License 2.0.
