diff --git a/NEWS.md b/NEWS.md index 008592c..cff58a9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +# Query.jl v1.1.0 Release Notes +* Add @pivot_wider and @pivot_longer + # Query.jl v0.12.2 Release Notes * Fix some bugs in the @select macro diff --git a/Project.toml b/Project.toml index dca75c1..80c52f2 100644 --- a/Project.toml +++ b/Project.toml @@ -22,7 +22,7 @@ TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" [compat] IterableTables = "0.8.2, 0.9, 0.10, 0.11, 1" julia = "1.10" -QueryOperators = "0.9.1" +QueryOperators = "1" DataValues = "0.4.4" MacroTools = "0.4.4, 0.5" diff --git a/docs/src/standalonequerycommands.md b/docs/src/standalonequerycommands.md index ba9a92a..54861a2 100644 --- a/docs/src/standalonequerycommands.md +++ b/docs/src/standalonequerycommands.md @@ -530,3 +530,78 @@ println(q) 2 │ 2 Unknown 3 │ 0 Three ``` + +## The `@pivot_longer` command + +The `@pivot_longer` command reshapes data from wide format to long format. Each row in the source is expanded into one output row per pivot column. Non-pivot columns are retained as-is, and two new columns are added: `:variable` (holding the original column name as a `Symbol`) and `:value` (holding the cell value). 
+ +Columns to pivot are selected with the same rich selector syntax as `@select`: + +| Syntax | Meaning | +|---------------------------|-----------------------------------------------------| +| `:col` | Include column by name | +| `startswith("prefix")` | Include columns whose name starts with `"prefix"` | +| `endswith("suffix")` | Include columns whose name ends with `"suffix"` | +| `occursin("sub")` | Include columns whose name contains `"sub"` | +| `!(startswith("prefix"))` | Exclude columns whose name starts with `"prefix"` | +| `-(startswith("prefix"))` | Same as above | +| `-:col` | Exclude column by name | +| `n` (integer) | Include column at position `n` | +| `:from::to` | Include a name range (inclusive) | +| `a:b` (integers) | Include a positional range (inclusive) | + +When only exclusion selectors are given (all starting with `-` or `!`), the starting set is all columns and the exclusions are removed. + +The names of the output columns can be customised with the `names_to` and `values_to` keyword arguments. Both accept a `Symbol` and default to `:variable` and `:value` respectively. 
+ +#### Examples + +```julia +using Query, DataFrames + +df = DataFrame(year=[2017,2018], US=[1,3], EU=[2,4]) + +# Explicit column names +result = df |> @pivot_longer(:US, :EU) |> DataFrame +# 4×3 DataFrame: year | variable | value + +# Custom output column names +result = df |> @pivot_longer(:US, :EU, names_to=:country, values_to=:sales) |> DataFrame +# 4×3 DataFrame: year | country | sales + +# Predicate — pivot all columns starting with "U" +result = df |> @pivot_longer(startswith("U")) |> DataFrame + +# Predicate with exclusion — pivot wk* columns except wk_total +df2 = DataFrame(id=[1,2], wk1=[10,20], wk2=[30,40], wk_total=[40,60]) +result = df2 |> @pivot_longer(startswith("wk"), -:wk_total) |> DataFrame +# pivots :wk1 and :wk2 only + +# Negated predicate — pivot everything except id columns +result = df2 |> @pivot_longer(!(startswith("id"))) |> DataFrame +``` + +## The `@pivot_wider` command + +The `@pivot_wider` command reshapes data from long format to wide format. It has the form `source |> @pivot_wider(names_from, values_from)`, where `names_from` is the quoted name of the column whose values become new column names, and `values_from` is the quoted name of the column whose values populate those new columns. All other columns are used as identifier columns. Absent combinations are represented as `DataValues.DataValue{T}()` (NA). 
+ +#### Example + +```julia +using Query, DataFrames + +long = DataFrame( + year = [2017, 2017, 2018, 2018], + country = [:US, :EU, :US, :EU], + value = [1, 2, 3, 4] +) + +result = long |> @pivot_wider(:country, :value) |> DataFrame + +# 2×3 DataFrame +# Row │ year US EU +# │ Int64 Union{Missing, Int64} Union{Missing, Int64} +# ─────┼────────────────────────────────────────────────── +# 1 │ 2017 1 2 +# 2 │ 2018 3 4 +``` diff --git a/src/Query.jl b/src/Query.jl index ff77807..7f61038 100644 --- a/src/Query.jl +++ b/src/Query.jl @@ -8,7 +8,8 @@ using QueryOperators export @from, @query, @count, Grouping, key export @map, @filter, @groupby, @orderby, @orderby_descending, @unique, - @thenby, @thenby_descending, @groupjoin, @join, @mapmany, @take, @drop + @thenby, @thenby_descending, @groupjoin, @join, @mapmany, @take, @drop, + @pivot_longer, @pivot_wider export @select, @rename, @mutate, @disallowna, @dropna, @replacena diff --git a/src/standalone_query_macros.jl b/src/standalone_query_macros.jl index 0655e8f..2576d01 100644 --- a/src/standalone_query_macros.jl +++ b/src/standalone_query_macros.jl @@ -249,3 +249,176 @@ macro unique(f) return :( i -> QueryOperators.unique(QueryOperators.query(i), $(esc(f_as_anonym_func)), $(esc(q)))) |> helper_namedtuples_replacement end
+
+# Names of the predicate functions accepted as column selectors.
+const _PIVOT_PREDICATES = (:startswith, :endswith, :occursin)
+
+# Returns true when `ex` is a one-argument call to one of `_PIVOT_PREDICATES`,
+# e.g. `startswith("wk")`.
+_is_pivot_predicate_call(ex) =
+    Meta.isexpr(ex, :call, 2) && ex.args[1] in _PIVOT_PREDICATES
+
+# Returns true when `ex` is the zero-argument call `everything()`.
+_is_pivot_everything(ex) = Meta.isexpr(ex, :call, 1) && ex.args[1] === :everything
+
+# Returns true when `ex` is a unary `-` or `!` call (an exclusion selector).
+_is_pivot_exclusion(ex) = Meta.isexpr(ex, :call, 2) && ex.args[1] in (:-, :!)
+
+# Returns true when `ex` is a binary `:` call such as `1:3` or `:a : :b`.
+_is_pivot_colon_range(ex) = Meta.isexpr(ex, :call, 3) && ex.args[1] == Symbol(":")
+
+# Returns true when `ex` is the name range `:a::b`. Julia parses that spelling as
+# `Expr(:(::), QuoteNode(:a), :b)`, not as a call to `:`.
+_is_pivot_name_range(ex) =
+    Meta.isexpr(ex, Symbol("::"), 2) && ex.args[1] isa QuoteNode
+
+# Returns true when a macro argument looks like a column selector (not a data source).
+# Keyword arguments (`names_to=:x`, `values_to=:x`) are never selectors.
+function _is_pivot_selector(arg)
+    (arg isa QuoteNode || arg isa Int) && return true
+    arg isa Expr || return false
+    arg.head in (:(=), :kw) && return false
+    return _is_pivot_exclusion(arg) || _is_pivot_predicate_call(arg) ||
+           _is_pivot_colon_range(arg) || _is_pivot_name_range(arg) ||
+           _is_pivot_everything(arg)
+end
+
+# Builds the `(:<mode>_<fn>, text)` instruction for a predicate call such as
+# `startswith("wk")`; `mode` is "include" or "exclude". The text is stored as a Symbol.
+function _pivot_predicate_instruction(mode, call)
+    fn, str = call.args[1], call.args[2]
+    str isa AbstractString || error("@pivot_longer: argument to $fn must be a string literal")
+    return (Symbol("$(mode)_$(fn)"), Symbol(str))
+end
+
+# Converts a single selector AST argument into a (op, arg) instruction tuple.
+# Raises an error for anything that is not a supported selector.
+function _pivot_selector_to_instruction(arg)
+    # :col — include by name
+    arg isa QuoteNode && return (:include_name, arg.value)
+    if arg isa Int
+        # n > 0 includes by position, n < 0 excludes by position; 0 is rejected below
+        arg > 0 && return (:include_position, arg)
+        arg < 0 && return (:exclude_position, -arg)
+    elseif arg isa Expr
+        if _is_pivot_everything(arg)
+            return (:include_all, :_)
+        elseif _is_pivot_exclusion(arg)
+            inner = arg.args[2]
+            if _is_pivot_predicate_call(inner)
+                # -(pred("x")) or !(pred("x"))
+                return _pivot_predicate_instruction("exclude", inner)
+            elseif inner isa QuoteNode && arg.args[1] == :-
+                # -:col (`!:col` is not a supported spelling)
+                return (:exclude_name, inner.value)
+            end
+        elseif _is_pivot_predicate_call(arg)
+            return _pivot_predicate_instruction("include", arg)
+        elseif _is_pivot_colon_range(arg)
+            a, b = arg.args[2], arg.args[3]
+            a isa Int && b isa Int && return (:include_range_idx, (a, b))
+            if a isa QuoteNode && b isa QuoteNode
+                return (:include_range, (a.value, b.value))
+            end
+        elseif _is_pivot_name_range(arg)
+            # :a::b — the right-hand name arrives as a bare Symbol
+            from, to = arg.args[1], arg.args[2]
+            to isa Symbol && return (:include_range, (from.value, to))
+            to isa QuoteNode && return (:include_range, (from.value, to.value))
+        end
+    end
+    error("@pivot_longer: unrecognised selector argument: $arg")
+end
+
+# Returns true when a macro argument is one of the supported keyword arguments
+# (`names_to=...` or `values_to=...`).
+function _is_pivot_kwarg(arg)
+    return arg isa Expr && arg.head in (:(=), :kw) &&
+           length(arg.args) == 2 && arg.args[1] in (:names_to, :values_to)
+end
+
+# `source |> @pivot_longer(selectors...)` or `@pivot_longer(source, selectors...)`:
+# reshape wide data to long data. `names_to=` / `values_to=` may appear anywhere in
+# the argument list, or after a `;`.
+#
+# The selectors are translated to instruction tuples at macro-expansion time and passed
+# to `QueryOperators._resolve_pivot_cols`; keyword values are evaluated in the caller's
+# scope and forwarded to `QueryOperators.pivot_longer`.
+macro pivot_longer(args...)
+    isempty(args) && error("@pivot_longer requires at least one column selector argument")
+
+    # `@pivot_longer(...; names_to=:x)` delivers the keywords as an
+    # `Expr(:parameters, ...)` argument; splice them back into the argument list.
+    flat_args = Any[]
+    for arg in args
+        if arg isa Expr && arg.head == :parameters
+            for kw in arg.args
+                _is_pivot_kwarg(kw) ||
+                    error("@pivot_longer: unsupported keyword argument: $kw")
+                push!(flat_args, kw)
+            end
+        else
+            push!(flat_args, arg)
+        end
+    end
+
+    # Split off the keywords first so that a leading `names_to=...` is never
+    # mistaken for the data source.
+    kw_args = filter(_is_pivot_kwarg, flat_args)
+    positional = filter(!_is_pivot_kwarg, flat_args)
+    isempty(positional) && error("@pivot_longer requires at least one column selector")
+
+    # Detect pipe form vs direct form:
+    #   pipe form   — first positional argument looks like a selector
+    #   direct form — first positional argument is the data source
+    local source_expr, col_selectors
+    if _is_pivot_selector(positional[1])
+        source_expr = nothing  # the piped value `i` is the source
+        col_selectors = positional
+    else
+        source_expr = positional[1]
+        col_selectors = positional[2:end]
+        isempty(col_selectors) && error("@pivot_longer requires at least one column selector")
+    end
+
+    # Keyword values come from the caller, so they are escaped
+    kwargs_exprs = Expr[Expr(:kw, kw.args[1], esc(kw.args[2])) for kw in kw_args]
+
+    # Build instruction tuple (evaluated at macro-expansion time)
+    instructions = Tuple(_pivot_selector_to_instruction(a) for a in col_selectors)
+
+    # QueryOperators.pivot_longer(src, cols; kwargs...)
+    call_expr = Expr(:call, :(QueryOperators.pivot_longer))
+    isempty(kwargs_exprs) || push!(call_expr.args, Expr(:parameters, kwargs_exprs...))
+    push!(call_expr.args, :src,
+        :(QueryOperators._resolve_pivot_cols(eltype(src), Val($instructions))))
+
+    # Bind the queried source to a local so the source expression is evaluated
+    # exactly once, although it is needed for both the data and its eltype.
+    bind_source(input) = :(let src = QueryOperators.query($input)
+        $call_expr
+    end)
+
+    if source_expr === nothing
+        return :(i -> $(bind_source(:i)))
+    else
+        return bind_source(esc(source_expr))
+    end
+end
+
+# `@pivot_wider(source, names_from, values_from)`: direct form. Expands to a call to
+# `QueryOperators.pivot_wider` on the queried `source`; all three arguments are
+# evaluated in the caller's scope.
+macro pivot_wider(source, names_from, values_from)
+    return :(QueryOperators.pivot_wider(
+        QueryOperators.query($(esc(source))), $(esc(names_from)), $(esc(values_from))
+    ))
+end
+
+# `source |> @pivot_wider(names_from, values_from)`: pipe form. Expands to a
+# one-argument function that applies `QueryOperators.pivot_wider` to the piped value.
+macro pivot_wider(names_from, values_from)
+    return :(i -> QueryOperators.pivot_wider(
+        QueryOperators.query(i), $(esc(names_from)), $(esc(values_from))
+    ))
+end
diff --git a/test/runtests.jl b/test/runtests.jl index a6a8f91..3eaf905 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,6 +5,7 @@ include("test_core.jl") include("test_dplyr-syntax.jl") include("test_pipesyntax.jl") include("test_macros.jl") +include("test_standalone.jl") @run_package_tests diff --git a/test/test_standalone.jl b/test/test_standalone.jl index 1f5fdb0..cd2ccda 100644 --- a/test/test_standalone.jl +++ b/test/test_standalone.jl @@ -50,3 +50,119 @@ end @test df |> @unique() |> collect == [(a=1,b=3.), (a=2,b=3.)] @test df |> @unique(_.b) |> collect == [(a=1,b=3.)] end + +@testitem "@pivot_longer operator" begin + using DataFrames + + df = DataFrame(year=[2017,2018], US=[1,3], EU=[2,4]) + + # Pipe form + result = df |> @pivot_longer(:US, :EU) |> collect + @test length(result) == 4 + @test eltype(result) == NamedTuple{(:year, :variable, :value), Tuple{Int, Symbol, Int}} + @test result[1] == (year=2017, variable=:US, value=1) + @test result[2] == (year=2017, variable=:EU, value=2) + @test result[3] == (year=2018, variable=:US, value=3) + @test result[4] == (year=2018, variable=:EU, value=4) + + # Direct form + result2 = @pivot_longer(df, :US, :EU) |> collect + @test result2 == result + + # Collects into a DataFrame + df2 = df |> @pivot_longer(:US, :EU) |> DataFrame + @test df2 isa DataFrame + @test size(df2) == (4, 3) + @test names(df2) == ["year", "variable", "value"] + + # Custom output column 
names (pipe form) + result3 = df |> @pivot_longer(:US, :EU, names_to=:country, values_to=:sales) |> collect + @test length(result3) == 4 + @test fieldnames(eltype(result3)) == (:year, :country, :sales) + @test result3[1] == (year=2017, country=:US, sales=1) + @test result3[4] == (year=2018, country=:EU, sales=4) + + # Custom output column names (direct form) + result4 = @pivot_longer(df, :US, :EU, names_to=:country, values_to=:sales) |> collect + @test result4 == result3 + + # Only names_to (values_to defaults to :value) + result5 = df |> @pivot_longer(:US, :EU, names_to=:country) |> collect + @test fieldnames(eltype(result5)) == (:year, :country, :value) + + # Only values_to (names_to defaults to :variable) + result6 = df |> @pivot_longer(:US, :EU, values_to=:amount) |> collect + @test fieldnames(eltype(result6)) == (:year, :variable, :amount) +end + +@testitem "@pivot_longer selector syntax" begin + using DataFrames + + # startswith selector + df = DataFrame(year=[2017,2018], wk1=[1,3], wk2=[2,4], total=[10,20]) + + result = df |> @pivot_longer(startswith("wk")) |> collect + @test length(result) == 4 + @test fieldnames(eltype(result)) == (:year, :total, :variable, :value) + @test result[1] == (year=2017, total=10, variable=:wk1, value=1) + @test result[2] == (year=2017, total=10, variable=:wk2, value=2) + + # endswith selector + df2 = DataFrame(sales_2017=[1,2], cost_2017=[3,4], sales_2018=[5,6]) + result2 = df2 |> @pivot_longer(endswith("2017")) |> collect + @test length(result2) == 4 + @test fieldnames(eltype(result2)) == (:sales_2018, :variable, :value) + + # occursin selector + result3 = df2 |> @pivot_longer(occursin("sales")) |> collect + @test length(result3) == 4 + @test fieldnames(eltype(result3)) == (:cost_2017, :variable, :value) + + # Explicit symbols still work (backward compat) + result4 = df |> @pivot_longer(:wk1, :wk2) |> collect + @test result4 == result + + # startswith + exclude by name + result5 = df |> @pivot_longer(startswith("wk"), -:wk2) |> 
collect + @test length(result5) == 2 + @test all(r.variable == :wk1 for r in result5) + + # Negated predicate !(startswith(...)) — "all except wk*" pivots :year and :total + result6 = df |> @pivot_longer(!(startswith("wk"))) |> collect + @test length(result6) == 4 # 2 non-wk cols × 2 rows + @test fieldnames(eltype(result6)) == (:wk1, :wk2, :variable, :value) + @test result6[1].variable == :year + + # Direct form with predicate + result7 = @pivot_longer(df, startswith("wk")) |> collect + @test result7 == result +end + +@testset "@pivot_wider operator" begin + long = DataFrame( + year = [2017, 2017, 2018, 2018], + country = [:US, :EU, :US, :EU], + value = [1, 2, 3, 4] + ) + + # Pipe form + result = long |> @pivot_wider(:country, :value) |> collect + @test length(result) == 2 + @test fieldnames(eltype(result)) == (:year, :US, :EU) + @test result[1].year == 2017 + @test result[1].US == DataValue(1) + @test result[1].EU == DataValue(2) + @test result[2].year == 2018 + @test result[2].US == DataValue(3) + @test result[2].EU == DataValue(4) + + # Direct form + result2 = @pivot_wider(long, :country, :value) |> collect + @test result2 == result + + # Collects into a DataFrame + df2 = long |> @pivot_wider(:country, :value) |> DataFrame + @test df2 isa DataFrame + @test size(df2) == (2, 3) + @test names(df2) == ["year", "US", "EU"] +end