LCM Logo
Programming

DuckDB SQL

DuckDB is an in-process analytical database that speaks standard SQL with a number of ergonomic extensions aimed at data science. Everything below runs unchanged in the DuckDB CLI, in a notebook, or in the rtemis live DuckDB console — no server, no setup.

DuckDB reads CSV, Parquet, JSON, and Arrow directly from a path or URL. You usually don't need to load data into a table first — you can query a file in place.

Loading data

You can query a file directly, or register it as a view or table.

sql
-- Query a file in place
SELECT * FROM 'data/penguins.csv' LIMIT 5;
SELECT * FROM 'data/penguins.parquet';
SELECT * FROM read_csv_auto('https://example.com/penguins.csv');

-- Persist into a table (materialized, faster for repeated queries)
CREATE TABLE penguins AS SELECT * FROM 'data/penguins.csv';

-- Or a view (no copy; re-reads the file each time)
CREATE VIEW penguins AS SELECT * FROM 'data/penguins.parquet';

The examples below assume a penguins table with columns species, island, bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g, sex, and year.

Inspecting schema and types

sql
-- Column names, types, nullability
DESCRIBE penguins;          -- or: DESC penguins;
PRAGMA table_info('penguins');

-- List tables/views in the current database
SHOW TABLES;

-- The full SQL describing any query's result
DESCRIBE SELECT species, body_mass_g FROM penguins;

-- Per-column summary stats (count, nulls, min/max, approx quantiles, std)
SUMMARIZE penguins;

SUMMARIZE is the fastest way to get oriented: it returns count, null %, distinct count, min/max/avg, and approximate quartiles for every column in one shot.

Selecting columns

DuckDB extends SELECT with EXCLUDE, REPLACE, and column patterns — a big quality-of-life win.

sql
SELECT species, body_mass_g FROM penguins;

-- Everything except a few columns
SELECT * EXCLUDE (year, sex) FROM penguins;

-- Everything, but transform one column in place
SELECT * REPLACE (body_mass_g / 1000.0 AS body_mass_g) FROM penguins;

-- Select columns by pattern (regex) or by glob
SELECT COLUMNS('.*_mm$') FROM penguins;        -- all bill/flipper length columns
SELECT COLUMNS('bill_.*') FROM penguins;

-- Apply a function across many columns at once
SELECT MIN(COLUMNS('.*_mm$')) FROM penguins;   -- min of each *_mm column

Filtering rows

sql
SELECT * FROM penguins WHERE species = 'Gentoo';
SELECT * FROM penguins WHERE body_mass_g > 4000 AND sex = 'male';
SELECT * FROM penguins WHERE island IN ('Biscoe', 'Dream');
SELECT * FROM penguins WHERE bill_length_mm BETWEEN 40 AND 50;
SELECT * FROM penguins WHERE sex IS NULL;              -- missing values
SELECT * FROM penguins WHERE species LIKE 'Adel%';     -- pattern match
SELECT * FROM penguins WHERE species ILIKE 'adel%';    -- case-insensitive

-- Order and page
SELECT * FROM penguins ORDER BY body_mass_g DESC NULLS LAST LIMIT 10;
SELECT * FROM penguins ORDER BY body_mass_g DESC LIMIT 10 OFFSET 10;

-- Sample rows (useful on large data)
SELECT * FROM penguins USING SAMPLE 5%;
SELECT * FROM penguins USING SAMPLE 100 ROWS;

Counting and distinct values

sql
SELECT count(*) FROM penguins;                      -- total rows
SELECT count(body_mass_g) FROM penguins;            -- non-null values
SELECT count(DISTINCT species) FROM penguins;       -- distinct count

-- Frequency table, most common first
SELECT species, count(*) AS n
FROM penguins
GROUP BY species
ORDER BY n DESC;

-- Distinct combinations
SELECT DISTINCT species, island FROM penguins;

Summarizing and aggregating

Aggregate functions collapse rows; GROUP BY does it per group. GROUP BY ALL infers the grouping columns from the non-aggregated columns in the SELECT list.

sql
-- Whole-table summary
SELECT
  count(*)               AS n,
  avg(body_mass_g)       AS mean_mass,
  median(body_mass_g)    AS median_mass,
  stddev(body_mass_g)    AS sd_mass,
  min(body_mass_g)       AS min_mass,
  max(body_mass_g)       AS max_mass,
  quantile_cont(body_mass_g, 0.9) AS p90
FROM penguins;

-- Grouped summary; GROUP BY ALL = GROUP BY species, island
SELECT
  species,
  island,
  count(*)         AS n,
  avg(body_mass_g) AS mean_mass
FROM penguins
GROUP BY ALL
ORDER BY mean_mass DESC;

-- Filter on aggregates with HAVING
SELECT species, avg(body_mass_g) AS mean_mass
FROM penguins
GROUP BY species
HAVING count(*) > 50;

-- Conditional aggregation (counts per category in one row)
SELECT
  count(*) FILTER (WHERE sex = 'male')   AS n_male,
  count(*) FILTER (WHERE sex = 'female') AS n_female
FROM penguins;

Handy DuckDB aggregates for data science: mode(), mad(), corr(x, y), regr_slope(y, x), approx_count_distinct(), histogram(col), and list(col) / array_agg(col) to collect values into an array per group.

Window functions

Windows compute across rows without collapsing them — ranks, running totals, lags, group-relative values.

sql
SELECT
  species,
  body_mass_g,
  -- rank within each species, heaviest first
  rank() OVER (PARTITION BY species ORDER BY body_mass_g DESC) AS mass_rank,
  -- each penguin's mass relative to its species mean
  body_mass_g - avg(body_mass_g) OVER (PARTITION BY species)   AS mass_vs_mean,
  -- running count
  row_number() OVER (PARTITION BY species ORDER BY body_mass_g) AS rn
FROM penguins;

Reshaping: wide ⇄ long

DuckDB has first-class PIVOT and UNPIVOT.

sql
-- Long → wide: one column per island, mean mass in the cells
PIVOT penguins
ON island
USING avg(body_mass_g)
GROUP BY species;

-- Wide → long: collapse the three measurement columns into key/value rows
UNPIVOT penguins
ON bill_length_mm, bill_depth_mm, flipper_length_mm
INTO NAME measurement VALUE mm;

Joins

sql
-- Given a second table `islands(island, region)`
SELECT p.species, p.island, i.region
FROM penguins p
JOIN islands i ON p.island = i.island;      -- INNER (default)

SELECT p.species, i.region
FROM penguins p
LEFT JOIN islands i ON p.island = i.island; -- keep all penguins

-- USING when the key column shares a name
SELECT * FROM penguins JOIN islands USING (island);

-- Other join types: RIGHT JOIN, FULL JOIN, CROSS JOIN,
-- and ASOF JOIN for nearest-match on a sorted/time key.

CTEs and subqueries

Common Table Expressions (WITH) name intermediate results and keep queries readable — prefer them over deeply nested subqueries.

sql
WITH heavy AS (
  SELECT * FROM penguins WHERE body_mass_g > 4500
)
SELECT species, count(*) AS n
FROM heavy
GROUP BY species;

-- Chain multiple CTEs
WITH per_species AS (
  SELECT species, avg(body_mass_g) AS mean_mass
  FROM penguins GROUP BY species
),
ranked AS (
  SELECT *, rank() OVER (ORDER BY mean_mass DESC) AS r FROM per_species
)
SELECT * FROM ranked WHERE r <= 2;

Handling missing values

sql
SELECT
  coalesce(sex, 'unknown') AS sex,         -- replace NULL with a default
  count(*) AS n
FROM penguins
GROUP BY ALL;

-- Drop rows with any NULL in key columns
SELECT * FROM penguins
WHERE body_mass_g IS NOT NULL AND sex IS NOT NULL;

-- Mean imputation
SELECT
  coalesce(body_mass_g, avg(body_mass_g) OVER ()) AS body_mass_g
FROM penguins;

Types and casting

sql
SELECT
  body_mass_g::DOUBLE        AS mass_dbl,       -- :: shorthand
  CAST(year AS VARCHAR)      AS year_str,
  try_cast('abc' AS INTEGER) AS safe,           -- returns NULL instead of erroring
  '2026-06-15'::DATE         AS d;

-- Common types: BOOLEAN, INTEGER/BIGINT, DOUBLE/DECIMAL, VARCHAR,
-- DATE, TIMESTAMP, BLOB, plus nested LIST, STRUCT, and MAP.

Dates and strings

sql
SELECT
  date_trunc('month', '2026-06-15'::DATE)  AS month_start,
  date_part('year', current_date)          AS yr,
  current_date - INTERVAL 7 DAY            AS week_ago,
  strftime(current_date, '%Y-%m-%d')       AS formatted;

SELECT
  upper(species)                AS up,
  length(species)               AS len,
  regexp_replace(species, ' ', '_') AS slug,
  split_part('Biscoe-1', '-', 1)    AS prefix;

Exporting results

Any query result can be written back out — round-tripping between formats is a one-liner.

sql
COPY (SELECT * FROM penguins WHERE species = 'Gentoo')
TO 'gentoo.parquet' (FORMAT PARQUET);

COPY (SELECT species, avg(body_mass_g) AS mean_mass FROM penguins GROUP BY species)
TO 'summary.csv' (HEADER, DELIMITER ',');

A typical data-science workflow in the console: SUMMARIZE the file to understand it → SELECT … WHERE to filter → GROUP BY ALL to aggregate → PIVOT/JOIN to reshape → COPY … TO to export. Each step is a standalone query you can iterate on.

On this page