diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4cad8db24..692563019 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -62,7 +62,7 @@ jobs: uses: actions/cache@v5 with: path: ~/.cargo - key: cargo-cache-${{ steps.rust-toolchain.outputs.cachekey }}-${{ hashFiles('Cargo.lock') }} + key: cargo-cache-${{ matrix.toolchain }}-${{ hashFiles('Cargo.lock') }} - name: Install dependencies uses: astral-sh/setup-uv@v7 @@ -106,7 +106,7 @@ jobs: RUST_BACKTRACE: 1 run: | git submodule update --init - uv run --no-project pytest -v . --import-mode=importlib + uv run --no-project pytest -v --import-mode=importlib - name: FFI unit tests run: | diff --git a/pyproject.toml b/pyproject.toml index 08d64eca0..b238e049e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,9 @@ features = ["substrait"] [tool.pytest.ini_options] asyncio_mode = "auto" asyncio_default_fixture_loop_scope = "function" +addopts = "--doctest-modules" +doctest_optionflags = ["NORMALIZE_WHITESPACE", "ELLIPSIS"] +testpaths = ["python/tests", "python/datafusion"] # Enable docstring linting using the google style guide [tool.ruff.lint] diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index d302c12a5..5bd0eec2d 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -327,8 +327,9 @@ def into_view(self, temporary: bool = False) -> Table: >>> df = ctx.sql("SELECT 1 AS value") >>> view = df.into_view() >>> ctx.register_table("values_view", view) - >>> df.collect() # The DataFrame is still usable - >>> ctx.sql("SELECT value FROM values_view").collect() + >>> result = ctx.sql("SELECT value FROM values_view").collect() + >>> result[0].column("value").to_pylist() + [1] """ from datafusion.catalog import Table as _Table @@ -1389,9 +1390,12 @@ def fill_null(self, value: Any, subset: list[str] | None = None) -> DataFrame: DataFrame with null values replaced where type casting is possible Examples: - >>> df = df.fill_null(0) # Fill all nulls with 0 where possible - >>> # Fill nulls in specific string columns - >>> df = df.fill_null("missing", subset=["name", "category"]) + >>> from datafusion import SessionContext, col + >>> ctx = SessionContext() + >>> df = ctx.from_pydict({"a": [1, None, 3], "b": [None, 5, 6]}) + >>> filled = df.fill_null(0) + >>> filled.sort(col("a")).collect()[0].column("a").to_pylist() + [0, 1, 3] Notes: - Only fills nulls in columns where the value can be cast to the column type diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 431afcc30..12156b20d 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -291,7 +291,18 @@ def isnan(expr: Expr) -> Expr: - """Returns true if a given number is +NaN or -NaN otherwise returns false.""" + """Returns true if a given number is +NaN or -NaN otherwise returns false. + + Examples: + --------- + >>> import datafusion as dfn + >>> import numpy as np + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, np.nan]}) + >>> result = df.select(dfn.functions.isnan(dfn.col("a")).alias("isnan")) + >>> result.collect_column("isnan")[1].as_py() + True + """ return Expr(f.isnan(expr.expr)) @@ -299,22 +310,69 @@ def nullif(expr1: Expr, expr2: Expr) -> Expr: """Returns NULL if expr1 equals expr2; otherwise it returns expr1. This can be used to perform the inverse operation of the COALESCE expression. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2], "b": [1, 3]}) + >>> result = df.select( + ... dfn.functions.nullif(dfn.col("a"), dfn.col("b")).alias("nullif")) + >>> result = result + >>> result.collect_column("nullif").to_pylist() + [None, 2] """ return Expr(f.nullif(expr1.expr, expr2.expr)) def encode(expr: Expr, encoding: Expr) -> Expr: - """Encode the ``input``, using the ``encoding``. encoding can be base64 or hex.""" + """Encode the ``input``, using the ``encoding``. encoding can be base64 or hex. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.encode(dfn.col("a"), dfn.lit("base64")).alias("enc")) + >>> result = result + >>> result.collect_column("enc")[0].as_py() + 'aGVsbG8' + """ return Expr(f.encode(expr.expr, encoding.expr)) def decode(expr: Expr, encoding: Expr) -> Expr: - """Decode the ``input``, using the ``encoding``. encoding can be base64 or hex.""" + """Decode the ``input``, using the ``encoding``. encoding can be base64 or hex. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["aGVsbG8="]}) + >>> result = df.select( + ... dfn.functions.decode(dfn.col("a"), dfn.lit("base64")).alias("dec")) + >>> result = result + >>> result.collect_column("dec")[0].as_py() + b'hello' + """ return Expr(f.decode(expr.expr, encoding.expr)) def array_to_string(expr: Expr, delimiter: Expr) -> Expr: - """Converts each element to its text representation.""" + """Converts each element to its text representation. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select( + ... dfn.functions.array_to_string(dfn.col("a"), dfn.lit(",")).alias("s")) + >>> result = result + >>> result.collect_column("s")[0].as_py() + '1,2,3' + """ return Expr(f.array_to_string(expr.expr, delimiter.expr.cast(pa.string()))) @@ -322,6 +380,17 @@ def array_join(expr: Expr, delimiter: Expr) -> Expr: """Converts each element to its text representation. This is an alias for :py:func:`array_to_string`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select( + ... dfn.functions.array_join(dfn.col("a"), dfn.lit("-")).alias("s")) + >>> result = result + >>> result.collect_column("s")[0].as_py() + '1-2-3' """ return array_to_string(expr, delimiter) @@ -330,6 +399,17 @@ def list_to_string(expr: Expr, delimiter: Expr) -> Expr: """Converts each element to its text representation. This is an alias for :py:func:`array_to_string`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[4, 5, 6]]}) + >>> result = df.select( + ... dfn.functions.list_to_string(dfn.col("a"), dfn.lit(",")).alias("s")) + >>> result = result + >>> result.collect_column("s")[0].as_py() + '4,5,6' """ return array_to_string(expr, delimiter) @@ -338,12 +418,35 @@ def list_join(expr: Expr, delimiter: Expr) -> Expr: """Converts each element to its text representation. This is an alias for :py:func:`array_to_string`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[7, 8, 9]]}) + >>> result = df.select( + ... dfn.functions.list_join(dfn.col("a"), dfn.lit("|")).alias("s")) + >>> result = result + >>> result.collect_column("s")[0].as_py() + '7|8|9' """ return array_to_string(expr, delimiter) def in_list(arg: Expr, values: list[Expr], negated: bool = False) -> Expr: - """Returns whether the argument is contained within the list ``values``.""" + """Returns whether the argument is contained within the list ``values``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.select( + ... dfn.functions.in_list(dfn.col("a"), [dfn.lit(1), dfn.lit(3)]).alias("in")) + >>> result = result + >>> result.collect_column("in").to_pylist() + [True, False, True] + """ values = [v.expr for v in values] return Expr(f.in_list(arg.expr, values, negated)) @@ -353,6 +456,16 @@ def digest(value: Expr, method: Expr) -> Expr: Standard algorithms are md5, sha224, sha256, sha384, sha512, blake2s, blake2b, and blake3. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.digest(dfn.col("a"), dfn.lit("md5")).alias("d")) + >>> len(result.collect_column("d")[0].as_py()) > 0 + True """ return Expr(f.digest(value.expr, method.expr)) @@ -361,6 +474,15 @@ def concat(*args: Expr) -> Expr: """Concatenates the text representations of all the arguments. NULL arguments are ignored. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"], "b": [" world"]}) + >>> result = df.select(dfn.functions.concat(dfn.col("a"), dfn.col("b")).alias("c")) + >>> result.collect_column("c")[0].as_py() + 'hello world' """ args = [arg.expr for arg in args] return Expr(f.concat(args)) @@ -370,32 +492,66 @@ def concat_ws(separator: str, *args: Expr) -> Expr: """Concatenates the list ``args`` with the separator. ``NULL`` arguments are ignored. ``separator`` should not be ``NULL``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"], "b": ["world"]}) + >>> result = df.select( + ... dfn.functions.concat_ws("-", dfn.col("a"), dfn.col("b")).alias("c")) + >>> result = result + >>> result.collect_column("c")[0].as_py() + 'hello-world' """ args = [arg.expr for arg in args] return Expr(f.concat_ws(separator, args)) def order_by(expr: Expr, ascending: bool = True, nulls_first: bool = True) -> SortExpr: - """Creates a new sort expression.""" + """Creates a new sort expression. + + Examples: + --------- + >>> import datafusion as dfn + >>> sort_expr = dfn.functions.order_by(dfn.col("a"), ascending=False) + >>> sort_expr.ascending() + False + """ return SortExpr(expr, ascending=ascending, nulls_first=nulls_first) def alias(expr: Expr, name: str, metadata: dict[str, str] | None = None) -> Expr: """Creates an alias expression with an optional metadata dictionary. - Args: + Parameters: + ----------- expr: The expression to alias name: The alias name metadata: Optional metadata to attach to the column - Returns: - An expression with the given alias + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2]}) + >>> df.select(dfn.functions.alias(dfn.col("a"), "b")).collect_column("b")[0].as_py() + 1 """ return Expr(f.alias(expr.expr, name, metadata)) def col(name: str) -> Expr: - """Creates a column reference expression.""" + """Creates a column reference expression. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> df.select(dfn.functions.col("a")).collect_column("a")[0].as_py() + 1 + """ return Expr(f.col(name)) @@ -409,6 +565,15 @@ def count_star(filter: Expr | None = None) -> Expr: Args: filter: If provided, only count rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.count_star().alias("cnt")]) + >>> result.collect_column("cnt")[0].as_py() + 3 """ return count(Expr.literal(1), filter=filter) @@ -419,6 +584,17 @@ def case(expr: Expr) -> CaseBuilder: Create a :py:class:`~datafusion.expr.CaseBuilder` to match cases for the expression ``expr``. See :py:class:`~datafusion.expr.CaseBuilder` for detailed usage. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.select( + ... dfn.functions.case(dfn.col("a")).when(dfn.lit(1), + ... dfn.lit("one")).otherwise(dfn.lit("other")).alias("c")) + >>> result.collect_column("c")[0].as_py() + 'one' """ return CaseBuilder(f.case(expr.expr)) @@ -429,6 +605,17 @@ def when(when: Expr, then: Expr) -> CaseBuilder: Create a :py:class:`~datafusion.expr.CaseBuilder` to match cases for the expression ``expr``. See :py:class:`~datafusion.expr.CaseBuilder` for detailed usage. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.select( + ... dfn.functions.when(dfn.col("a") > dfn.lit(2), + ... dfn.lit("big")).otherwise(dfn.lit("small")).alias("c")) + >>> result.collect_column("c")[2].as_py() + 'big' """ return CaseBuilder(f.when(when.expr, then.expr)) @@ -480,10 +667,14 @@ def window( def abs(arg: Expr) -> Expr: """Return the absolute value of a given number. - Returns: - -------- - Expr - A new expression representing the absolute value of the input expression. + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [-1, 0, 1]}) + >>> result = df.select(dfn.functions.abs(dfn.col("a")).alias("abs")) + >>> result.collect_column("abs")[0].as_py() + 1 """ return Expr(f.abs(arg.expr)) @@ -491,127 +682,376 @@ def abs(arg: Expr) -> Expr: def acos(arg: Expr) -> Expr: """Returns the arc cosine or inverse cosine of a number. - Returns: - -------- - Expr - A new expression representing the arc cosine of the input expression. + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0]}) + >>> result = df.select(dfn.functions.acos(dfn.col("a")).alias("acos")) + >>> result.collect_column("acos")[0].as_py() + 0.0 """ return Expr(f.acos(arg.expr)) def acosh(arg: Expr) -> Expr: - """Returns inverse hyperbolic cosine.""" + """Returns inverse hyperbolic cosine. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0]}) + >>> result = df.select(dfn.functions.acosh(dfn.col("a")).alias("acosh")) + >>> result.collect_column("acosh")[0].as_py() + 0.0 + """ return Expr(f.acosh(arg.expr)) def ascii(arg: Expr) -> Expr: - """Returns the numeric code of the first character of the argument.""" + """Returns the numeric code of the first character of the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["a","b","c"]}) + >>> ascii_df = df.select(dfn.functions.ascii(dfn.col("a")).alias("ascii")) + >>> ascii_df.collect_column("ascii")[0].as_py() + 97 + """ return Expr(f.ascii(arg.expr)) def asin(arg: Expr) -> Expr: - """Returns the arc sine or inverse sine of a number.""" + """Returns the arc sine or inverse sine of a number. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.asin(dfn.col("a")).alias("asin")) + >>> result.collect_column("asin")[0].as_py() + 0.0 + """ return Expr(f.asin(arg.expr)) def asinh(arg: Expr) -> Expr: - """Returns inverse hyperbolic sine.""" + """Returns inverse hyperbolic sine. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.asinh(dfn.col("a")).alias("asinh")) + >>> result.collect_column("asinh")[0].as_py() + 0.0 + """ return Expr(f.asinh(arg.expr)) def atan(arg: Expr) -> Expr: - """Returns inverse tangent of a number.""" + """Returns inverse tangent of a number. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.atan(dfn.col("a")).alias("atan")) + >>> result.collect_column("atan")[0].as_py() + 0.0 + """ return Expr(f.atan(arg.expr)) def atanh(arg: Expr) -> Expr: - """Returns inverse hyperbolic tangent.""" + """Returns inverse hyperbolic tangent. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.atanh(dfn.col("a")).alias("atanh")) + >>> result.collect_column("atanh")[0].as_py() + 0.0 + """ return Expr(f.atanh(arg.expr)) def atan2(y: Expr, x: Expr) -> Expr: - """Returns inverse tangent of a division given in the argument.""" + """Returns inverse tangent of a division given in the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [0.0], "x": [1.0]}) + >>> result = df.select( + ... dfn.functions.atan2(dfn.col("y"), dfn.col("x")).alias("atan2")) + >>> result = result + >>> result.collect_column("atan2")[0].as_py() + 0.0 + """ return Expr(f.atan2(y.expr, x.expr)) def bit_length(arg: Expr) -> Expr: - """Returns the number of bits in the string argument.""" + """Returns the number of bits in the string argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["a","b","c"]}) + >>> bit_df = df.select(dfn.functions.bit_length(dfn.col("a")).alias("bit_len")) + >>> bit_df.collect_column("bit_len")[0].as_py() + 8 + """ return Expr(f.bit_length(arg.expr)) def btrim(arg: Expr) -> Expr: - """Removes all characters, spaces by default, from both sides of a string.""" + """Removes all characters, spaces by default, from both sides of a string. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [" a "]}) + >>> trim_df = df.select(dfn.functions.btrim(dfn.col("a")).alias("trimmed")) + >>> trim_df.collect_column("trimmed")[0].as_py() + 'a' + """ return Expr(f.btrim(arg.expr)) def cbrt(arg: Expr) -> Expr: - """Returns the cube root of a number.""" + """Returns the cube root of a number. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [27]}) + >>> cbrt_df = df.select(dfn.functions.cbrt(dfn.col("a")).alias("cbrt")) + >>> cbrt_df.collect_column("cbrt")[0].as_py() + 3.0 + """ return Expr(f.cbrt(arg.expr)) def ceil(arg: Expr) -> Expr: - """Returns the nearest integer greater than or equal to argument.""" + """Returns the nearest integer greater than or equal to argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.9]}) + >>> floor_df = df.select(dfn.functions.ceil(dfn.col("a")).alias("ceil")) + >>> floor_df.collect_column("ceil")[0].as_py() + 2.0 + """ return Expr(f.ceil(arg.expr)) def character_length(arg: Expr) -> Expr: - """Returns the number of characters in the argument.""" + """Returns the number of characters in the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["abc","b","c"]}) + >>> char_len_df = df.select( + ... dfn.functions.character_length(dfn.col("a")).alias("char_len")) + >>> char_len_df.collect_column("char_len")[0].as_py() + 3 + """ return Expr(f.character_length(arg.expr)) def length(string: Expr) -> Expr: - """The number of characters in the ``string``.""" + """The number of characters in the ``string``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.length(dfn.col("a")).alias("len")) + >>> result.collect_column("len")[0].as_py() + 5 + """ return Expr(f.length(string.expr)) def char_length(string: Expr) -> Expr: - """The number of characters in the ``string``.""" + """The number of characters in the ``string``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.char_length(dfn.col("a")).alias("len")) + >>> result.collect_column("len")[0].as_py() + 5 + """ return Expr(f.char_length(string.expr)) def chr(arg: Expr) -> Expr: - """Converts the Unicode code point to a UTF8 character.""" + """Converts the Unicode code point to a UTF8 character. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [65]}) + >>> result = df.select(dfn.functions.chr(dfn.col("a")).alias("chr")) + >>> result.collect_column("chr")[0].as_py() + 'A' + """ return Expr(f.chr(arg.expr)) def coalesce(*args: Expr) -> Expr: - """Returns the value of the first expr in ``args`` which is not NULL.""" + """Returns the value of the first expr in ``args`` which is not NULL. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [None, 1], "b": [2, 3]}) + >>> result = df.select( + ... dfn.functions.coalesce(dfn.col("a"), dfn.col("b")).alias("c")) + >>> result = result + >>> result.collect_column("c")[0].as_py() + 2 + """ args = [arg.expr for arg in args] return Expr(f.coalesce(*args)) def cos(arg: Expr) -> Expr: - """Returns the cosine of the argument.""" + """Returns the cosine of the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0,-1,1]}) + >>> cos_df = df.select(dfn.functions.cos(dfn.col("a")).alias("cos")) + >>> cos_df.collect_column("cos")[0].as_py() + 1.0 + """ return Expr(f.cos(arg.expr)) def cosh(arg: Expr) -> Expr: - """Returns the hyperbolic cosine of the argument.""" + """Returns the hyperbolic cosine of the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0,-1,1]}) + >>> cosh_df = df.select(dfn.functions.cosh(dfn.col("a")).alias("cosh")) + >>> cosh_df.collect_column("cosh")[0].as_py() + 1.0 + """ return Expr(f.cosh(arg.expr)) def cot(arg: Expr) -> Expr: - """Returns the cotangent of the argument.""" + """Returns the cotangent of the argument. + + Examples: + --------- + >>> from math import pi + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [pi / 4]}) + >>> import builtins + >>> result = df.select( + ... dfn.functions.cot(dfn.col("a")).alias("cot") + ... ) + >>> builtins.round( + ... result.collect_column("cot")[0].as_py(), 1 + ... ) + 1.0 + """ return Expr(f.cot(arg.expr)) def degrees(arg: Expr) -> Expr: - """Converts the argument from radians to degrees.""" + """Converts the argument from radians to degrees. + + Examples: + --------- + >>> from math import pi + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0,pi,2*pi]}) + >>> deg_df = df.select(dfn.functions.degrees(dfn.col("a")).alias("deg")) + >>> deg_df.collect_column("deg")[2].as_py() + 360.0 + """ return Expr(f.degrees(arg.expr)) def ends_with(arg: Expr, suffix: Expr) -> Expr: - """Returns true if the ``string`` ends with the ``suffix``, false otherwise.""" + """Returns true if the ``string`` ends with the ``suffix``, false otherwise. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["abc","b","c"]}) + >>> ends_with_df = df.select( + ... dfn.functions.ends_with(dfn.col("a"), dfn.lit("c")).alias("ends_with")) + >>> ends_with_df.collect_column("ends_with")[0].as_py() + True + """ return Expr(f.ends_with(arg.expr, suffix.expr)) def exp(arg: Expr) -> Expr: - """Returns the exponential of the argument.""" + """Returns the exponential of the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.exp(dfn.col("a")).alias("exp")) + >>> result.collect_column("exp")[0].as_py() + 1.0 + """ return Expr(f.exp(arg.expr)) def factorial(arg: Expr) -> Expr: - """Returns the factorial of the argument.""" + """Returns the factorial of the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [3]}) + >>> result = df.select( + ... dfn.functions.factorial(dfn.col("a")).alias("factorial") + ... ) + >>> result.collect_column("factorial")[0].as_py() + 6 + """ return Expr(f.factorial(arg.expr)) @@ -622,17 +1062,48 @@ def find_in_set(string: Expr, string_list: Expr) -> Expr: ``string_list`` consisting of N substrings. The string list is a string composed of substrings separated by ``,`` characters. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["b"]}) + >>> result = df.select( + ... dfn.functions.find_in_set(dfn.col("a"), dfn.lit("a,b,c")).alias("pos")) + >>> result = result + >>> result.collect_column("pos")[0].as_py() + 2 """ return Expr(f.find_in_set(string.expr, string_list.expr)) def floor(arg: Expr) -> Expr: - """Returns the nearest integer less than or equal to the argument.""" + """Returns the nearest integer less than or equal to the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.9]}) + >>> floor_df = df.select(dfn.functions.floor(dfn.col("a")).alias("floor")) + >>> floor_df.collect_column("floor")[0].as_py() + 1.0 + """ return Expr(f.floor(arg.expr)) def gcd(x: Expr, y: Expr) -> Expr: - """Returns the greatest common divisor.""" + """Returns the greatest common divisor. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [12], "b": [8]}) + >>> result = df.select(dfn.functions.gcd(dfn.col("a"), dfn.col("b")).alias("gcd")) + >>> result.collect_column("gcd")[0].as_py() + 4 + """ return Expr(f.gcd(x.expr, y.expr)) @@ -641,6 +1112,15 @@ def initcap(string: Expr) -> Expr: Converts the first letter of each word in ``string`` to uppercase and the remaining characters to lowercase. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["the cat"]}) + >>> cap_df = df.select(dfn.functions.initcap(dfn.col("a")).alias("cap")) + >>> cap_df.collect_column("cap")[0].as_py() + 'The Cat' """ return Expr(f.initcap(string.expr)) @@ -649,52 +1129,155 @@ def instr(string: Expr, substring: Expr) -> Expr: """Finds the position from where the ``substring`` matches the ``string``. This is an alias for :py:func:`strpos`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello world"]}) + >>> result = df.select( + ... dfn.functions.instr(dfn.col("a"), dfn.lit("world")).alias("pos")) + >>> result = result + >>> result.collect_column("pos")[0].as_py() + 7 """ return strpos(string, substring) def iszero(arg: Expr) -> Expr: - """Returns true if a given number is +0.0 or -0.0 otherwise returns false.""" + """Returns true if a given number is +0.0 or -0.0 otherwise returns false. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0, 1.0]}) + >>> result = df.select(dfn.functions.iszero(dfn.col("a")).alias("iz")) + >>> result.collect_column("iz")[0].as_py() + True + """ return Expr(f.iszero(arg.expr)) def lcm(x: Expr, y: Expr) -> Expr: - """Returns the least common multiple.""" + """Returns the least common multiple. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [4], "b": [6]}) + >>> result = df.select(dfn.functions.lcm(dfn.col("a"), dfn.col("b")).alias("lcm")) + >>> result.collect_column("lcm")[0].as_py() + 12 + """ return Expr(f.lcm(x.expr, y.expr)) def left(string: Expr, n: Expr) -> Expr: - """Returns the first ``n`` characters in the ``string``.""" + """Returns the first ``n`` characters in the ``string``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["the cat"]}) + >>> left_df = df.select(dfn.functions.left(dfn.col("a"), dfn.lit(3)).alias("left")) + >>> left_df.collect_column("left")[0].as_py() + 'the' + """ return Expr(f.left(string.expr, n.expr)) def levenshtein(string1: Expr, string2: Expr) -> Expr: - """Returns the Levenshtein distance between the two given strings.""" + """Returns the Levenshtein distance between the two given strings. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["kitten"]}) + >>> result = df.select( + ... dfn.functions.levenshtein(dfn.col("a"), dfn.lit("sitting")).alias("d")) + >>> result = result + >>> result.collect_column("d")[0].as_py() + 3 + """ return Expr(f.levenshtein(string1.expr, string2.expr)) def ln(arg: Expr) -> Expr: - """Returns the natural logarithm (base e) of the argument.""" + """Returns the natural logarithm (base e) of the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0]}) + >>> result = df.select(dfn.functions.ln(dfn.col("a")).alias("ln")) + >>> result.collect_column("ln")[0].as_py() + 0.0 + """ return Expr(f.ln(arg.expr)) def log(base: Expr, num: Expr) -> Expr: - """Returns the logarithm of a number for a particular ``base``.""" + """Returns the logarithm of a number for a particular ``base``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [100.0]}) + >>> result = df.select(dfn.functions.log(dfn.lit(10.0), dfn.col("a")).alias("log")) + >>> result.collect_column("log")[0].as_py() + 2.0 + """ return Expr(f.log(base.expr, num.expr)) def log10(arg: Expr) -> Expr: - """Base 10 logarithm of the argument.""" + """Base 10 logarithm of the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [100.0]}) + >>> result = df.select(dfn.functions.log10(dfn.col("a")).alias("log10")) + >>> result.collect_column("log10")[0].as_py() + 2.0 + """ return Expr(f.log10(arg.expr)) def log2(arg: Expr) -> Expr: - """Base 2 logarithm of the argument.""" + """Base 2 logarithm of the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [8.0]}) + >>> result = df.select(dfn.functions.log2(dfn.col("a")).alias("log2")) + >>> result.collect_column("log2")[0].as_py() + 3.0 + """ return Expr(f.log2(arg.expr)) def lower(arg: Expr) -> Expr: - """Converts a string to lowercase.""" + """Converts a string to lowercase. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["THE CaT"]}) + >>> lower_df = df.select(dfn.functions.lower(dfn.col("a")).alias("lower")) + >>> lower_df.collect_column("lower")[0].as_py() + 'the cat' + """ return Expr(f.lower(arg.expr)) @@ -704,33 +1287,100 @@ def lpad(string: Expr, count: Expr, characters: Expr | None = None) -> Expr: Extends the string to length length by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right). + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["the cat", "a hat"]}) + >>> lpad_df = df.select(dfn.functions.lpad(dfn.col("a"), dfn.lit(6)).alias("lpad")) + >>> lpad_df.collect_column("lpad")[0].as_py() + 'the ca' + >>> lpad_df.collect_column("lpad")[1].as_py() + ' a hat' """ characters = characters if characters is not None else Expr.literal(" ") return Expr(f.lpad(string.expr, count.expr, characters.expr)) def ltrim(arg: Expr) -> Expr: - """Removes all characters, spaces by default, from the beginning of a string.""" + """Removes all characters, spaces by default, from the beginning of a string. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [" a "]}) + >>> trim_df = df.select(dfn.functions.ltrim(dfn.col("a")).alias("trimmed")) + >>> trim_df.collect_column("trimmed")[0].as_py() + 'a ' + """ return Expr(f.ltrim(arg.expr)) def md5(arg: Expr) -> Expr: - """Computes an MD5 128-bit checksum for a string expression.""" + """Computes an MD5 128-bit checksum for a string expression. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.md5(dfn.col("a")).alias("md5")) + >>> result.collect_column("md5")[0].as_py() + '5d41402abc4b2a76b9719d911017c592' + """ return Expr(f.md5(arg.expr)) def nanvl(x: Expr, y: Expr) -> Expr: - """Returns ``x`` if ``x`` is not ``NaN``. Otherwise returns ``y``.""" + """Returns ``x`` if ``x`` is not ``NaN``. Otherwise returns ``y``. + + Examples: + --------- + >>> import datafusion as dfn + >>> import numpy as np + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [np.nan, 1.0], "b": [0.0, 0.0]}) + >>> nanvl_df = df.select( + ... dfn.functions.nanvl(dfn.col("a"), dfn.col("b")).alias("nanvl")) + >>> nanvl_df.collect_column("nanvl")[0].as_py() + 0.0 + >>> nanvl_df.collect_column("nanvl")[1].as_py() + 1.0 + """ return Expr(f.nanvl(x.expr, y.expr)) def nvl(x: Expr, y: Expr) -> Expr: - """Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``.""" + """Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [None, 1], "b": [0, 0]}) + >>> nvl_df = df.select(dfn.functions.nvl(dfn.col("a"), dfn.col("b")).alias("nvl")) + >>> nvl_df.collect_column("nvl")[0].as_py() + 0 + >>> nvl_df.collect_column("nvl")[1].as_py() + 1 + """ return Expr(f.nvl(x.expr, y.expr)) def octet_length(arg: Expr) -> Expr: - """Returns the number of bytes of a string.""" + """Returns the number of bytes of a string. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.octet_length(dfn.col("a")).alias("len")) + >>> result.collect_column("len")[0].as_py() + 5 + """ return Expr(f.octet_length(arg.expr)) @@ -741,6 +1391,18 @@ def overlay( Replace the substring of string that starts at the ``start``'th character and extends for ``length`` characters with new substring. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["abcdef"]}) + >>> result = df.select( + ... dfn.functions.overlay(dfn.col("a"), dfn.lit("XY"), dfn.lit(3), + ... dfn.lit(2)).alias("o")) + >>> result = result + >>> result.collect_column("o")[0].as_py() + 'abXYef' """ if length is None: return Expr(f.overlay(string.expr, substring.expr, start.expr)) @@ -748,7 +1410,22 @@ def overlay( def pi() -> Expr: - """Returns an approximate value of π.""" + """Returns an approximate value of π. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> import builtins + >>> result = df.select( + ... dfn.functions.pi().alias("pi") + ... ) + >>> builtins.round( + ... result.collect_column("pi")[0].as_py(), 5 + ... ) + 3.14159 + """ return Expr(f.pi()) @@ -756,12 +1433,33 @@ def position(string: Expr, substring: Expr) -> Expr: """Finds the position from where the ``substring`` matches the ``string``. This is an alias for :py:func:`strpos`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.position(dfn.col("a"), dfn.lit("llo")).alias("pos")) + >>> result = result + >>> result.collect_column("pos")[0].as_py() + 3 """ return strpos(string, substring) def power(base: Expr, exponent: Expr) -> Expr: - """Returns ``base`` raised to the power of ``exponent``.""" + """Returns ``base`` raised to the power of ``exponent``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [2.0]}) + >>> result = df.select(dfn.functions.power(dfn.col("a"), dfn.lit(3.0)).alias("pow")) + >>> result.collect_column("pow")[0].as_py() + 8.0 + """ return Expr(f.power(base.expr, exponent.expr)) @@ -769,20 +1467,58 @@ def pow(base: Expr, exponent: Expr) -> Expr: """Returns ``base`` raised to the power of ``exponent``. This is an alias of :py:func:`power`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [3.0]}) + >>> result = df.select(dfn.functions.pow(dfn.col("a"), dfn.lit(2.0)).alias("pow")) + >>> result.collect_column("pow")[0].as_py() + 9.0 """ return power(base, exponent) def radians(arg: Expr) -> Expr: - """Converts the argument from degrees to radians.""" + """Converts the argument from degrees to radians. + + Examples: + --------- + >>> from math import pi + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [180.0]}) + >>> import builtins + >>> result = df.select( + ... dfn.functions.radians(dfn.col("a")).alias("rad") + ... ) + >>> builtins.round( + ... result.collect_column("rad")[0].as_py(), 6 + ... ) + 3.141593 + """ return Expr(f.radians(arg.expr)) def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: - """Find if any regular expression (regex) matches exist. + r"""Find if any regular expression (regex) matches exist. Tests a string using a regular expression returning true if at least one match, false otherwise. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello123"]}) + >>> result = df.select( + ... dfn.functions.regexp_like( + ... dfn.col("a"), dfn.lit("\\d+") + ... ).alias("m") + ... ) + >>> result.collect_column("m")[0].as_py() + True """ if flags is not None: flags = flags.expr @@ -790,10 +1526,23 @@ def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: - """Perform regular expression (regex) matching. + r"""Perform regular expression (regex) matching. Returns an array with each element containing the leftmost-first match of the corresponding index in ``regex`` to string in ``string``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello 42 world"]}) + >>> result = df.select( + ... dfn.functions.regexp_match( + ... dfn.col("a"), dfn.lit("(\\d+)") + ... ).alias("m") + ... ) + >>> result.collect_column("m")[0].as_py() + ['42'] """ if flags is not None: flags = flags.expr @@ -803,13 +1552,27 @@ def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: def regexp_replace( string: Expr, pattern: Expr, replacement: Expr, flags: Expr | None = None ) -> Expr: - """Replaces substring(s) matching a PCRE-like regular expression. + r"""Replaces substring(s) matching a PCRE-like regular expression. The full list of supported features and syntax can be found at Supported flags with the addition of 'g' can be found at + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello 42"]}) + >>> result = df.select( + ... dfn.functions.regexp_replace( + ... dfn.col("a"), dfn.lit("\\d+"), + ... dfn.lit("XX") + ... ).alias("r") + ... ) + >>> result.collect_column("r")[0].as_py() + 'hello XX' """ if flags is not None: flags = flags.expr @@ -823,6 +1586,17 @@ def regexp_count( Optional start position (the first position is 1) to search for the regular expression. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["abcabc"]}) + >>> result = df.select( + ... dfn.functions.regexp_count(dfn.col("a"), dfn.lit("abc")).alias("c")) + >>> result = result + >>> result.collect_column("c")[0].as_py() + 2 """ if flags is not None: flags = flags.expr @@ -838,12 +1612,25 @@ def regexp_instr( flags: Expr | None = None, sub_expr: Expr | None = None, ) -> Expr: - """Returns the position of a regular expression match in a string. + r"""Returns the position of a regular expression match in a string. Searches ``values`` for the ``n``-th occurrence of ``regex``, starting at position ``start`` (the first position is 1). Returns the starting or ending position based on ``end_position``. Use ``flags`` to control regex behavior and ``sub_expr`` to return the position of a specific capture group instead of the entire match. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello 42 world"]}) + >>> result = df.select( + ... dfn.functions.regexp_instr( + ... dfn.col("a"), dfn.lit("\\d+") + ... ).alias("pos") + ... ) + >>> result.collect_column("pos")[0].as_py() + 7 """ start = start.expr if start is not None else None n = n.expr if n is not None else None @@ -863,22 +1650,65 @@ def regexp_instr( def repeat(string: Expr, n: Expr) -> Expr: - """Repeats the ``string`` to ``n`` times.""" + """Repeats the ``string`` to ``n`` times. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["ha"]}) + >>> result = df.select(dfn.functions.repeat(dfn.col("a"), dfn.lit(3)).alias("r")) + >>> result.collect_column("r")[0].as_py() + 'hahaha' + """ return Expr(f.repeat(string.expr, n.expr)) def replace(string: Expr, from_val: Expr, to_val: Expr) -> Expr: - """Replaces all occurrences of ``from_val`` with ``to_val`` in the ``string``.""" + """Replaces all occurrences of ``from_val`` with ``to_val`` in the ``string``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello world"]}) + >>> result = df.select( + ... dfn.functions.replace(dfn.col("a"), dfn.lit("world"), + ... dfn.lit("there")).alias("r")) + >>> result = result + >>> result.collect_column("r")[0].as_py() + 'hello there' + """ return Expr(f.replace(string.expr, from_val.expr, to_val.expr)) def reverse(arg: Expr) -> Expr: - """Reverse the string argument.""" + """Reverse the string argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.reverse(dfn.col("a")).alias("r")) + >>> result.collect_column("r")[0].as_py() + 'olleh' + """ return Expr(f.reverse(arg.expr)) def right(string: Expr, n: Expr) -> Expr: - """Returns the last ``n`` characters in the ``string``.""" + """Returns the last ``n`` characters in the ``string``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.right(dfn.col("a"), dfn.lit(3)).alias("r")) + >>> result.collect_column("r")[0].as_py() + 'llo' + """ return Expr(f.right(string.expr, n.expr)) @@ -888,6 +1718,15 @@ def round(value: Expr, decimal_places: Expr | None = None) -> Expr: If the optional ``decimal_places`` is specified, round to the nearest number of decimal places. You can specify a negative number of decimal places. For example ``round(lit(125.2345), lit(-2))`` would yield a value of ``100.0``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.567]}) + >>> result = df.select(dfn.functions.round(dfn.col("a"), dfn.lit(2)).alias("r")) + >>> result.collect_column("r")[0].as_py() + 1.57 """ if decimal_places is None: decimal_places = Expr.literal(0) @@ -899,48 +1738,147 @@ def rpad(string: Expr, count: Expr, characters: Expr | None = None) -> Expr: Extends the string to length length by appending the characters fill (a space by default). If the string is already longer than length then it is truncated. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hi"]}) + >>> result = df.select( + ... dfn.functions.rpad(dfn.col("a"), dfn.lit(5), dfn.lit("!")).alias("r")) + >>> result = result + >>> result.collect_column("r")[0].as_py() + 'hi!!!' """ characters = characters if characters is not None else Expr.literal(" ") return Expr(f.rpad(string.expr, count.expr, characters.expr)) def rtrim(arg: Expr) -> Expr: - """Removes all characters, spaces by default, from the end of a string.""" + """Removes all characters, spaces by default, from the end of a string. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [" a "]}) + >>> trim_df = df.select(dfn.functions.rtrim(dfn.col("a")).alias("trimmed")) + >>> trim_df.collect_column("trimmed")[0].as_py() + ' a' + """ return Expr(f.rtrim(arg.expr)) def sha224(arg: Expr) -> Expr: - """Computes the SHA-224 hash of a binary string.""" + """Computes the SHA-224 hash of a binary string. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha224(dfn.col("a")).alias("h") + ... ) + >>> len(result.collect_column("h")[0].as_py()) > 0 + True + """ return Expr(f.sha224(arg.expr)) def sha256(arg: Expr) -> Expr: - """Computes the SHA-256 hash of a binary string.""" + """Computes the SHA-256 hash of a binary string. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha256(dfn.col("a")).alias("h") + ... ) + >>> len(result.collect_column("h")[0].as_py()) > 0 + True + """ return Expr(f.sha256(arg.expr)) def sha384(arg: Expr) -> Expr: - """Computes the SHA-384 hash of a binary string.""" + """Computes the SHA-384 hash of a binary string. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha384(dfn.col("a")).alias("h") + ... ) + >>> len(result.collect_column("h")[0].as_py()) > 0 + True + """ return Expr(f.sha384(arg.expr)) def sha512(arg: Expr) -> Expr: - """Computes the SHA-512 hash of a binary string.""" + """Computes the SHA-512 hash of a binary string. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha512(dfn.col("a")).alias("h") + ... ) + >>> len(result.collect_column("h")[0].as_py()) > 0 + True + """ return Expr(f.sha512(arg.expr)) def signum(arg: Expr) -> Expr: - """Returns the sign of the argument (-1, 0, +1).""" + """Returns the sign of the argument (-1, 0, +1). + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [-5.0, 0.0, 5.0]}) + >>> result = df.select(dfn.functions.signum(dfn.col("a")).alias("s")) + >>> result.collect_column("s").to_pylist() + [-1.0, 0.0, 1.0] + """ return Expr(f.signum(arg.expr)) def sin(arg: Expr) -> Expr: - """Returns the sine of the argument.""" + """Returns the sine of the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.sin(dfn.col("a")).alias("sin")) + >>> result.collect_column("sin")[0].as_py() + 0.0 + """ return Expr(f.sin(arg.expr)) def sinh(arg: Expr) -> Expr: - """Returns the hyperbolic sine of the argument.""" + """Returns the hyperbolic sine of the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.sinh(dfn.col("a")).alias("sinh")) + >>> result.collect_column("sinh")[0].as_py() + 0.0 + """ return Expr(f.sinh(arg.expr)) @@ -949,27 +1887,82 @@ def split_part(string: Expr, delimiter: Expr, index: Expr) -> Expr: Splits a string based on a delimiter and picks out the desired field based on the index. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["a,b,c"]}) + >>> result = df.select( + ... dfn.functions.split_part(dfn.col("a"), dfn.lit(","), dfn.lit(2)).alias("s")) + >>> result = result + >>> result.collect_column("s")[0].as_py() + 'b' """ return Expr(f.split_part(string.expr, delimiter.expr, index.expr)) def sqrt(arg: Expr) -> Expr: - """Returns the square root of the argument.""" + """Returns the square root of the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [9.0]}) + >>> result = df.select(dfn.functions.sqrt(dfn.col("a")).alias("sqrt")) + >>> result.collect_column("sqrt")[0].as_py() + 3.0 + """ return Expr(f.sqrt(arg.expr)) def starts_with(string: Expr, prefix: Expr) -> Expr: - """Returns true if string starts with prefix.""" + """Returns true if string starts with prefix. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello_from_datafusion"]}) + >>> result = df.select( + ... dfn.functions.starts_with(dfn.col("a"), dfn.lit("hello")).alias("sw")) + >>> result = result + >>> result.collect_column("sw")[0].as_py() + True + """ return Expr(f.starts_with(string.expr, prefix.expr)) def strpos(string: Expr, substring: Expr) -> Expr: - """Finds the position from where the ``substring`` matches the ``string``.""" + """Finds the position from where the ``substring`` matches the ``string``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.strpos(dfn.col("a"), dfn.lit("llo")).alias("pos")) + >>> result = result + >>> result.collect_column("pos")[0].as_py() + 3 + """ return Expr(f.strpos(string.expr, substring.expr)) def substr(string: Expr, position: Expr) -> Expr: - """Substring from the ``position`` to the end.""" + """Substring from the ``position`` to the end. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.substr(dfn.col("a"), dfn.lit(3)).alias("s")) + >>> result.collect_column("s")[0].as_py() + 'llo' + """ return Expr(f.substr(string.expr, position.expr)) @@ -978,27 +1971,81 @@ def substr_index(string: Expr, delimiter: Expr, count: Expr) -> Expr: The return will be the ``string`` from before ``count`` occurrences of ``delimiter``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["a.b.c"]}) + >>> result = df.select( + ... dfn.functions.substr_index(dfn.col("a"), dfn.lit("."), + ... dfn.lit(2)).alias("s")) + >>> result = result + >>> result.collect_column("s")[0].as_py() + 'a.b' """ return Expr(f.substr_index(string.expr, delimiter.expr, count.expr)) def substring(string: Expr, position: Expr, length: Expr) -> Expr: - """Substring from the ``position`` with ``length`` characters.""" + """Substring from the ``position`` with ``length`` characters. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello world"]}) + >>> result = df.select( + ... dfn.functions.substring(dfn.col("a"), dfn.lit(1), dfn.lit(5)).alias("s")) + >>> result = result + >>> result.collect_column("s")[0].as_py() + 'hello' + """ return Expr(f.substring(string.expr, position.expr, length.expr)) def tan(arg: Expr) -> Expr: - """Returns the tangent of the argument.""" + """Returns the tangent of the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.tan(dfn.col("a")).alias("tan")) + >>> result.collect_column("tan")[0].as_py() + 0.0 + """ return Expr(f.tan(arg.expr)) def tanh(arg: Expr) -> Expr: - """Returns the hyperbolic tangent of the argument.""" + """Returns the hyperbolic tangent of the argument. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.tanh(dfn.col("a")).alias("tanh")) + >>> result.collect_column("tanh")[0].as_py() + 0.0 + """ return Expr(f.tanh(arg.expr)) def to_hex(arg: Expr) -> Expr: - """Converts an integer to a hexadecimal string.""" + """Converts an integer to a hexadecimal string. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [255]}) + >>> result = df.select(dfn.functions.to_hex(dfn.col("a")).alias("hex")) + >>> result.collect_column("hex")[0].as_py() + 'ff' + """ return Expr(f.to_hex(arg.expr)) @@ -1006,6 +2053,21 @@ def now() -> Expr: """Returns the current timestamp in nanoseconds. This will use the same value for all instances of now() in same statement. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.now().alias("now") + ... ) + + Use .value instead of .as_py() because nanosecond timestamps + require pandas to convert to Python datetime objects. + + >>> result.collect_column("now")[0].value > 0 + True """ return Expr(f.now()) @@ -1016,6 +2078,19 @@ def to_timestamp(arg: Expr, *formatters: Expr) -> Expr: For usage of ``formatters`` see the rust chrono package ``strftime`` package. [Documentation here.](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-01-01T00:00:00"]}) + >>> result = df.select( + ... dfn.functions.to_timestamp( + ... dfn.col("a") + ... ).alias("ts") + ... ) + >>> str(result.collect_column("ts")[0].as_py()) + '2021-01-01 00:00:00' """ if formatters is None: return f.to_timestamp(arg.expr) @@ -1028,6 +2103,19 @@ def to_timestamp_millis(arg: Expr, *formatters: Expr) -> Expr: """Converts a string and optional formats to a ``Timestamp`` in milliseconds. See :py:func:`to_timestamp` for a description on how to use formatters. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-01-01T00:00:00"]}) + >>> result = df.select( + ... dfn.functions.to_timestamp_millis( + ... dfn.col("a") + ... ).alias("ts") + ... ) + >>> str(result.collect_column("ts")[0].as_py()) + '2021-01-01 00:00:00' """ formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_millis(arg.expr, *formatters)) @@ -1037,6 +2125,19 @@ def to_timestamp_micros(arg: Expr, *formatters: Expr) -> Expr: """Converts a string and optional formats to a ``Timestamp`` in microseconds. See :py:func:`to_timestamp` for a description on how to use formatters. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-01-01T00:00:00"]}) + >>> result = df.select( + ... dfn.functions.to_timestamp_micros( + ... dfn.col("a") + ... ).alias("ts") + ... ) + >>> str(result.collect_column("ts")[0].as_py()) + '2021-01-01 00:00:00' """ formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_micros(arg.expr, *formatters)) @@ -1046,6 +2147,19 @@ def to_timestamp_nanos(arg: Expr, *formatters: Expr) -> Expr: """Converts a string and optional formats to a ``Timestamp`` in nanoseconds. See :py:func:`to_timestamp` for a description on how to use formatters. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-01-01T00:00:00"]}) + >>> result = df.select( + ... dfn.functions.to_timestamp_nanos( + ... dfn.col("a") + ... ).alias("ts") + ... ) + >>> str(result.collect_column("ts")[0].as_py()) + '2021-01-01 00:00:00' """ formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_nanos(arg.expr, *formatters)) @@ -1055,24 +2169,75 @@ def to_timestamp_seconds(arg: Expr, *formatters: Expr) -> Expr: """Converts a string and optional formats to a ``Timestamp`` in seconds. See :py:func:`to_timestamp` for a description on how to use formatters. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-01-01T00:00:00"]}) + >>> result = df.select( + ... dfn.functions.to_timestamp_seconds( + ... dfn.col("a") + ... ).alias("ts") + ... ) + >>> str(result.collect_column("ts")[0].as_py()) + '2021-01-01 00:00:00' """ formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_seconds(arg.expr, *formatters)) def to_unixtime(string: Expr, *format_arguments: Expr) -> Expr: - """Converts a string and optional formats to a Unixtime.""" + """Converts a string and optional formats to a Unixtime. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["1970-01-01T00:00:00"]}) + >>> result = df.select(dfn.functions.to_unixtime(dfn.col("a")).alias("u")) + >>> result.collect_column("u")[0].as_py() + 0 + """ args = [f.expr for f in format_arguments] return Expr(f.to_unixtime(string.expr, *args)) def current_date() -> Expr: - """Returns current UTC date as a Date32 value.""" + """Returns current UTC date as a Date32 value. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.current_date().alias("d") + ... ) + >>> result.collect_column("d")[0].as_py() is not None + True + """ return Expr(f.current_date()) def current_time() -> Expr: - """Returns current UTC time as a Time64 value.""" + """Returns current UTC time as a Time64 value. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.current_time().alias("t") + ... ) + + Use .value instead of .as_py() because nanosecond timestamps + require pandas to convert to Python datetime objects. + + >>> result.collect_column("t")[0].value > 0 + True + """ return Expr(f.current_time()) @@ -1080,12 +2245,37 @@ def datepart(part: Expr, date: Expr) -> Expr: """Return a specified part of a date. This is an alias for :py:func:`date_part`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-07-15T00:00:00"]}) + >>> df = df.select(dfn.functions.to_timestamp(dfn.col("a")).alias("a")) + >>> result = df.select( + ... dfn.functions.datepart(dfn.lit("month"), dfn.col("a")).alias("m")) + >>> result = result + >>> result.collect_column("m")[0].as_py() + 7 """ return date_part(part, date) def date_part(part: Expr, date: Expr) -> Expr: - """Extracts a subfield from the date.""" + """Extracts a subfield from the date. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-07-15T00:00:00"]}) + >>> df = df.select(dfn.functions.to_timestamp(dfn.col("a")).alias("a")) + >>> result = df.select( + ... dfn.functions.date_part(dfn.lit("year"), dfn.col("a")).alias("y")) + >>> result = result + >>> result.collect_column("y")[0].as_py() + 2021 + """ return Expr(f.date_part(part.expr, date.expr)) @@ -1093,12 +2283,39 @@ def extract(part: Expr, date: Expr) -> Expr: """Extracts a subfield from the date. This is an alias for :py:func:`date_part`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-07-15T00:00:00"]}) + >>> df = df.select(dfn.functions.to_timestamp(dfn.col("a")).alias("a")) + >>> result = df.select( + ... dfn.functions.extract(dfn.lit("day"), dfn.col("a")).alias("d")) + >>> result = result + >>> result.collect_column("d")[0].as_py() + 15 """ return date_part(part, date) def date_trunc(part: Expr, date: Expr) -> Expr: - """Truncates the date to a specified level of precision.""" + """Truncates the date to a specified level of precision. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-07-15T12:34:56"]}) + >>> df = df.select(dfn.functions.to_timestamp(dfn.col("a")).alias("a")) + >>> result = df.select( + ... dfn.functions.date_trunc( + ... dfn.lit("month"), dfn.col("a") + ... ).alias("t") + ... ) + >>> str(result.collect_column("t")[0].as_py()) + '2021-07-01 00:00:00' + """ return Expr(f.date_trunc(part.expr, date.expr)) @@ -1106,44 +2323,140 @@ def datetrunc(part: Expr, date: Expr) -> Expr: """Truncates the date to a specified level of precision. This is an alias for :py:func:`date_trunc`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-07-15T12:34:56"]}) + >>> df = df.select(dfn.functions.to_timestamp(dfn.col("a")).alias("a")) + >>> result = df.select( + ... dfn.functions.datetrunc( + ... dfn.lit("year"), dfn.col("a") + ... ).alias("t") + ... ) + >>> str(result.collect_column("t")[0].as_py()) + '2021-01-01 00:00:00' """ return date_trunc(part, date) def date_bin(stride: Expr, source: Expr, origin: Expr) -> Expr: - """Coerces an arbitrary timestamp to the start of the nearest specified interval.""" + """Coerces an arbitrary timestamp to the start of the nearest specified interval. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> result = ctx.sql( + ... "SELECT date_bin(interval '1 day'," + ... " timestamp '2021-07-15 12:34:56'," + ... " timestamp '2021-01-01') as b" + ... ) + >>> str(result.collect_column("b")[0].as_py()) + '2021-07-15 00:00:00' + """ return Expr(f.date_bin(stride.expr, source.expr, origin.expr)) def make_date(year: Expr, month: Expr, day: Expr) -> Expr: - """Make a date from year, month and day component parts.""" + """Make a date from year, month and day component parts. + + Examples: + --------- + >>> import datafusion as dfn + >>> from datetime import date + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [2024], "m": [1], "d": [15]}) + >>> result = df.select( + ... dfn.functions.make_date(dfn.col("y"), dfn.col("m"), + ... dfn.col("d")).alias("dt")) + >>> result = result + >>> result.collect_column("dt")[0].as_py() + datetime.date(2024, 1, 15) + """ return Expr(f.make_date(year.expr, month.expr, day.expr)) def translate(string: Expr, from_val: Expr, to_val: Expr) -> Expr: - """Replaces the characters in ``from_val`` with the counterpart in ``to_val``.""" + """Replaces the characters in ``from_val`` with the counterpart in ``to_val``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.translate(dfn.col("a"), dfn.lit("helo"), + ... dfn.lit("HELO")).alias("t")) + >>> result = result + >>> result.collect_column("t")[0].as_py() + 'HELLO' + """ return Expr(f.translate(string.expr, from_val.expr, to_val.expr)) def trim(arg: Expr) -> Expr: - """Removes all characters, spaces by default, from both sides of a string.""" + """Removes all characters, spaces by default, from both sides of a string. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [" hello "]}) + >>> result = df.select(dfn.functions.trim(dfn.col("a")).alias("t")) + >>> result.collect_column("t")[0].as_py() + 'hello' + """ return Expr(f.trim(arg.expr)) def trunc(num: Expr, precision: Expr | None = None) -> Expr: - """Truncate the number toward zero with optional precision.""" + """Truncate the number toward zero with optional precision. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.567]}) + >>> result = df.select(dfn.functions.trunc(dfn.col("a")).alias("t")) + >>> result.collect_column("t")[0].as_py() + 1.0 + """ if precision is not None: return Expr(f.trunc(num.expr, precision.expr)) return Expr(f.trunc(num.expr)) def upper(arg: Expr) -> Expr: - """Converts a string to uppercase.""" + """Converts a string to uppercase. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.upper(dfn.col("a")).alias("u")) + >>> result.collect_column("u")[0].as_py() + 'HELLO' + """ return Expr(f.upper(arg.expr)) def make_array(*args: Expr) -> Expr: - """Returns an array using the specified input expressions.""" + """Returns an array using the specified input expressions. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.make_array(dfn.lit(1), dfn.lit(2), dfn.lit(3)).alias("arr")) + >>> result = result + >>> result.collect_column("arr")[0].as_py() + [1, 2, 3] + """ args = [arg.expr for arg in args] return Expr(f.make_array(args)) @@ -1152,6 +2465,15 @@ def make_list(*args: Expr) -> Expr: """Returns an array using the specified input expressions. This is an alias for :py:func:`make_array`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select(dfn.functions.make_list(dfn.lit(4), dfn.lit(5)).alias("arr")) + >>> result.collect_column("arr")[0].as_py() + [4, 5] """ return make_array(*args) @@ -1160,28 +2482,89 @@ def array(*args: Expr) -> Expr: """Returns an array using the specified input expressions. This is an alias for :py:func:`make_array`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select(dfn.functions.array(dfn.lit(7), dfn.lit(8)).alias("arr")) + >>> result.collect_column("arr")[0].as_py() + [7, 8] """ return make_array(*args) def range(start: Expr, stop: Expr, step: Expr) -> Expr: - """Create a list of values in the range between start and stop.""" + """Create a list of values in the range between start and stop. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.range(dfn.lit(0), dfn.lit(5), dfn.lit(2)).alias("r")) + >>> result = result + >>> result.collect_column("r")[0].as_py() + [0, 2, 4] + """ return Expr(f.range(start.expr, stop.expr, step.expr)) def uuid() -> Expr: - """Returns uuid v4 as a string value.""" + """Returns uuid v4 as a string value. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.uuid().alias("u") + ... ) + >>> len(result.collect_column("u")[0].as_py()) == 36 + True + """ return Expr(f.uuid()) def struct(*args: Expr) -> Expr: - """Returns a struct with the given arguments.""" + """Returns a struct with the given arguments. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1], "b": [2]}) + >>> result = df.select( + ... dfn.functions.struct( + ... dfn.col("a"), dfn.col("b") + ... ).alias("s") + ... ) + >>> result.collect_column("s")[0].as_py() == {"c0": 1, "c1": 2} + True + """ args = [arg.expr for arg in args] return Expr(f.struct(*args)) def named_struct(name_pairs: list[tuple[str, Expr]]) -> Expr: - """Returns a struct with the given names and arguments pairs.""" + """Returns a struct with the given names and arguments pairs. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.named_struct( + ... [("x", dfn.lit(10)), ("y", dfn.lit(20))] + ... ).alias("s") + ... ) + >>> result.collect_column("s")[0].as_py() == {"x": 10, "y": 20} + True + """ name_pair_exprs = [ [Expr.literal(pa.scalar(pair[0], type=pa.string())), pair[1]] for pair in name_pairs @@ -1193,27 +2576,87 @@ def named_struct(name_pairs: list[tuple[str, Expr]]) -> Expr: def from_unixtime(arg: Expr) -> Expr: - """Converts an integer to RFC3339 timestamp format string.""" + """Converts an integer to RFC3339 timestamp format string. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0]}) + >>> result = df.select( + ... dfn.functions.from_unixtime( + ... dfn.col("a") + ... ).alias("ts") + ... ) + >>> str(result.collect_column("ts")[0].as_py()) + '1970-01-01 00:00:00' + """ return Expr(f.from_unixtime(arg.expr)) def arrow_typeof(arg: Expr) -> Expr: - """Returns the Arrow type of the expression.""" + """Returns the Arrow type of the expression. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select(dfn.functions.arrow_typeof(dfn.col("a")).alias("t")) + >>> result.collect_column("t")[0].as_py() + 'Int64' + """ return Expr(f.arrow_typeof(arg.expr)) def arrow_cast(expr: Expr, data_type: Expr) -> Expr: - """Casts an expression to a specified data type.""" + """Casts an expression to a specified data type. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> result = ctx.sql( + ... "SELECT arrow_cast(1, 'Float64') as c" + ... ) + >>> result.collect_column("c")[0].as_py() + 1.0 + """ return Expr(f.arrow_cast(expr.expr, data_type.expr)) def random() -> Expr: - """Returns a random value in the range ``0.0 <= x < 1.0``.""" + """Returns a random value in the range ``0.0 <= x < 1.0``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.random().alias("r") + ... ) + >>> val = result.collect_column("r")[0].as_py() + >>> 0.0 <= val < 1.0 + True + """ return Expr(f.random()) def array_append(array: Expr, element: Expr) -> Expr: - """Appends an element to the end of an array.""" + """Appends an element to the end of an array. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select( + ... dfn.functions.array_append(dfn.col("a"), dfn.lit(4)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [1, 2, 3, 4] + """ return Expr(f.array_append(array.expr, element.expr)) @@ -1221,6 +2664,17 @@ def array_push_back(array: Expr, element: Expr) -> Expr: """Appends an element to the end of an array. This is an alias for :py:func:`array_append`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]]}) + >>> result = df.select( + ... dfn.functions.array_push_back(dfn.col("a"), dfn.lit(3)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [1, 2, 3] """ return array_append(array, element) @@ -1229,6 +2683,17 @@ def list_append(array: Expr, element: Expr) -> Expr: """Appends an element to the end of an array. This is an alias for :py:func:`array_append`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]]}) + >>> result = df.select( + ... dfn.functions.list_append(dfn.col("a"), dfn.lit(3)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [1, 2, 3] """ return array_append(array, element) @@ -1237,12 +2702,35 @@ def list_push_back(array: Expr, element: Expr) -> Expr: """Appends an element to the end of an array. This is an alias for :py:func:`array_append`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]]}) + >>> result = df.select( + ... dfn.functions.list_push_back(dfn.col("a"), dfn.lit(3)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [1, 2, 3] """ return array_append(array, element) def array_concat(*args: Expr) -> Expr: - """Concatenates the input arrays.""" + """Concatenates the input arrays. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]}) + >>> result = df.select( + ... dfn.functions.array_concat(dfn.col("a"), dfn.col("b")).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [1, 2, 3, 4] + """ args = [arg.expr for arg in args] return Expr(f.array_concat(args)) @@ -1251,17 +2739,54 @@ def array_cat(*args: Expr) -> Expr: """Concatenates the input arrays. This is an alias for :py:func:`array_concat`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]}) + >>> result = df.select( + ... dfn.functions.array_cat(dfn.col("a"), dfn.col("b")).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [1, 2, 3, 4] """ return array_concat(*args) def array_dims(array: Expr) -> Expr: - """Returns an array of the array's dimensions.""" + """Returns an array of the array's dimensions. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.array_dims(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [3] + """ return Expr(f.array_dims(array.expr)) def array_distinct(array: Expr) -> Expr: - """Returns distinct values from the array after removing duplicates.""" + """Returns distinct values from the array after removing duplicates. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 1, 2, 3]]}) + >>> result = df.select( + ... dfn.functions.array_distinct( + ... dfn.col("a") + ... ).alias("result") + ... ) + >>> sorted( + ... result.collect_column("result")[0].as_py() + ... ) + [1, 2, 3] + """ return Expr(f.array_distinct(array.expr)) @@ -1269,6 +2794,17 @@ def list_cat(*args: Expr) -> Expr: """Concatenates the input arrays. This is an alias for :py:func:`array_concat`, :py:func:`array_cat`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]}) + >>> result = df.select( + ... dfn.functions.list_cat(dfn.col("a"), dfn.col("b")).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [1, 2, 3, 4] """ return array_concat(*args) @@ -1277,6 +2813,17 @@ def list_concat(*args: Expr) -> Expr: """Concatenates the input arrays. This is an alias for :py:func:`array_concat`, :py:func:`array_cat`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]}) + >>> result = df.select( + ... dfn.functions.list_concat(dfn.col("a"), dfn.col("b")).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [1, 2, 3, 4] """ return array_concat(*args) @@ -1285,6 +2832,21 @@ def list_distinct(array: Expr) -> Expr: """Returns distinct values from the array after removing duplicates. This is an alias for :py:func:`array_distinct`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 1, 2, 3]]}) + >>> result = df.select( + ... dfn.functions.list_distinct( + ... dfn.col("a") + ... ).alias("result") + ... ) + >>> sorted( + ... result.collect_column("result")[0].as_py() + ... ) + [1, 2, 3] """ return array_distinct(array) @@ -1293,17 +2855,48 @@ def list_dims(array: Expr) -> Expr: """Returns an array of the array's dimensions. This is an alias for :py:func:`array_dims`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.list_dims(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [3] """ return array_dims(array) def array_element(array: Expr, n: Expr) -> Expr: - """Extracts the element with the index n from the array.""" + """Extracts the element with the index n from the array. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[10, 20, 30]]}) + >>> result = df.select( + ... dfn.functions.array_element(dfn.col("a"), dfn.lit(2)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + 20 + """ return Expr(f.array_element(array.expr, n.expr)) def array_empty(array: Expr) -> Expr: - """Returns a boolean indicating whether the array is empty.""" + """Returns a boolean indicating whether the array is empty. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]]}) + >>> result = df.select(dfn.functions.array_empty(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + False + """ return Expr(f.array_empty(array.expr)) @@ -1311,6 +2904,17 @@ def array_extract(array: Expr, n: Expr) -> Expr: """Extracts the element with the index n from the array. This is an alias for :py:func:`array_element`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[10, 20, 30]]}) + >>> result = df.select( + ... dfn.functions.array_extract(dfn.col("a"), dfn.lit(2)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + 20 """ return array_element(array, n) @@ -1319,6 +2923,17 @@ def list_element(array: Expr, n: Expr) -> Expr: """Extracts the element with the index n from the array. This is an alias for :py:func:`array_element`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[10, 20, 30]]}) + >>> result = df.select( + ... dfn.functions.list_element(dfn.col("a"), dfn.lit(2)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + 20 """ return array_element(array, n) @@ -1327,12 +2942,33 @@ def list_extract(array: Expr, n: Expr) -> Expr: """Extracts the element with the index n from the array. This is an alias for :py:func:`array_element`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[10, 20, 30]]}) + >>> result = df.select( + ... dfn.functions.list_extract(dfn.col("a"), dfn.lit(2)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + 20 """ return array_element(array, n) def array_length(array: Expr) -> Expr: - """Returns the length of the array.""" + """Returns the length of the array. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.array_length(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + 3 + """ return Expr(f.array_length(array.expr)) @@ -1340,12 +2976,33 @@ def list_length(array: Expr) -> Expr: """Returns the length of the array. This is an alias for :py:func:`array_length`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.list_length(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + 3 """ return array_length(array) def array_has(first_array: Expr, second_array: Expr) -> Expr: - """Returns true if the element appears in the first array, otherwise false.""" + """Returns true if the element appears in the first array, otherwise false. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select( + ... dfn.functions.array_has(dfn.col("a"), dfn.lit(2)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + True + """ return Expr(f.array_has(first_array.expr, second_array.expr)) @@ -1354,6 +3011,17 @@ def array_has_all(first_array: Expr, second_array: Expr) -> Expr: Returns true if each element of the second array appears in the first array. Otherwise, it returns false. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[1, 2]]}) + >>> result = df.select( + ... dfn.functions.array_has_all(dfn.col("a"), dfn.col("b")).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + True """ return Expr(f.array_has_all(first_array.expr, second_array.expr)) @@ -1363,12 +3031,35 @@ def array_has_any(first_array: Expr, second_array: Expr) -> Expr: Returns true if at least one element of the second array appears in the first array. Otherwise, it returns false. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 5]]}) + >>> result = df.select( + ... dfn.functions.array_has_any(dfn.col("a"), dfn.col("b")).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + True """ return Expr(f.array_has_any(first_array.expr, second_array.expr)) def array_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: - """Return the position of the first occurrence of ``element`` in ``array``.""" + """Return the position of the first occurrence of ``element`` in ``array``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[10, 20, 30]]}) + >>> result = df.select( + ... dfn.functions.array_position(dfn.col("a"), dfn.lit(20)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + 2 + """ return Expr(f.array_position(array.expr, element.expr, index)) @@ -1376,6 +3067,17 @@ def array_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: """Return the position of the first occurrence of ``element`` in ``array``. This is an alias for :py:func:`array_position`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[10, 20, 30]]}) + >>> result = df.select( + ... dfn.functions.array_indexof(dfn.col("a"), dfn.lit(20)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + 2 """ return array_position(array, element, index) @@ -1384,6 +3086,17 @@ def list_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: """Return the position of the first occurrence of ``element`` in ``array``. This is an alias for :py:func:`array_position`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[10, 20, 30]]}) + >>> result = df.select( + ... dfn.functions.list_position(dfn.col("a"), dfn.lit(20)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + 2 """ return array_position(array, element, index) @@ -1392,12 +3105,35 @@ def list_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: """Return the position of the first occurrence of ``element`` in ``array``. This is an alias for :py:func:`array_position`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[10, 20, 30]]}) + >>> result = df.select( + ... dfn.functions.list_indexof(dfn.col("a"), dfn.lit(20)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + 2 """ return array_position(array, element, index) def array_positions(array: Expr, element: Expr) -> Expr: - """Searches for an element in the array and returns all occurrences.""" + """Searches for an element in the array and returns all occurrences. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... dfn.functions.array_positions(dfn.col("a"), dfn.lit(1)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [1, 3] + """ return Expr(f.array_positions(array.expr, element.expr)) @@ -1405,12 +3141,33 @@ def list_positions(array: Expr, element: Expr) -> Expr: """Searches for an element in the array and returns all occurrences. This is an alias for :py:func:`array_positions`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... dfn.functions.list_positions(dfn.col("a"), dfn.lit(1)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [1, 3] """ return array_positions(array, element) def array_ndims(array: Expr) -> Expr: - """Returns the number of dimensions of the array.""" + """Returns the number of dimensions of the array. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.array_ndims(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + 1 + """ return Expr(f.array_ndims(array.expr)) @@ -1418,12 +3175,33 @@ def list_ndims(array: Expr) -> Expr: """Returns the number of dimensions of the array. This is an alias for :py:func:`array_ndims`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.list_ndims(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + 1 """ return array_ndims(array) def array_prepend(element: Expr, array: Expr) -> Expr: - """Prepends an element to the beginning of an array.""" + """Prepends an element to the beginning of an array. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]]}) + >>> result = df.select( + ... dfn.functions.array_prepend(dfn.lit(0), dfn.col("a")).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [0, 1, 2] + """ return Expr(f.array_prepend(element.expr, array.expr)) @@ -1431,6 +3209,17 @@ def array_push_front(element: Expr, array: Expr) -> Expr: """Prepends an element to the beginning of an array. This is an alias for :py:func:`array_prepend`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]]}) + >>> result = df.select( + ... dfn.functions.array_push_front(dfn.lit(0), dfn.col("a")).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [0, 1, 2] """ return array_prepend(element, array) @@ -1439,6 +3228,17 @@ def list_prepend(element: Expr, array: Expr) -> Expr: """Prepends an element to the beginning of an array. This is an alias for :py:func:`array_prepend`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]]}) + >>> result = df.select( + ... dfn.functions.list_prepend(dfn.lit(0), dfn.col("a")).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [0, 1, 2] """ return array_prepend(element, array) @@ -1447,22 +3247,65 @@ def list_push_front(element: Expr, array: Expr) -> Expr: """Prepends an element to the beginning of an array. This is an alias for :py:func:`array_prepend`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]]}) + >>> result = df.select( + ... dfn.functions.list_push_front(dfn.lit(0), dfn.col("a")).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [0, 1, 2] """ return array_prepend(element, array) def array_pop_back(array: Expr) -> Expr: - """Returns the array without the last element.""" + """Returns the array without the last element. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.array_pop_back(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [1, 2] + """ return Expr(f.array_pop_back(array.expr)) def array_pop_front(array: Expr) -> Expr: - """Returns the array without the first element.""" + """Returns the array without the first element. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.array_pop_front(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [2, 3] + """ return Expr(f.array_pop_front(array.expr)) def array_remove(array: Expr, element: Expr) -> Expr: - """Removes the first element from the array equal to the given value.""" + """Removes the first element from the array equal to the given value. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... dfn.functions.array_remove(dfn.col("a"), dfn.lit(1)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [2, 1] + """ return Expr(f.array_remove(array.expr, element.expr)) @@ -1470,12 +3313,36 @@ def list_remove(array: Expr, element: Expr) -> Expr: """Removes the first element from the array equal to the given value. This is an alias for :py:func:`array_remove`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... dfn.functions.list_remove(dfn.col("a"), dfn.lit(1)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [2, 1] """ return array_remove(array, element) def array_remove_n(array: Expr, element: Expr, max: Expr) -> Expr: - """Removes the first ``max`` elements from the array equal to the given value.""" + """Removes the first ``max`` elements from the array equal to the given value. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1, 1]]}) + >>> result = df.select( + ... dfn.functions.array_remove_n(dfn.col("a"), dfn.lit(1), + ... dfn.lit(2)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [2, 1] + """ return Expr(f.array_remove_n(array.expr, element.expr, max.expr)) @@ -1483,12 +3350,36 @@ def list_remove_n(array: Expr, element: Expr, max: Expr) -> Expr: """Removes the first ``max`` elements from the array equal to the given value. This is an alias for :py:func:`array_remove_n`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1, 1]]}) + >>> result = df.select( + ... dfn.functions.list_remove_n(dfn.col("a"), dfn.lit(1), + ... dfn.lit(2)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [2, 1] """ return array_remove_n(array, element, max) def array_remove_all(array: Expr, element: Expr) -> Expr: - """Removes all elements from the array equal to the given value.""" + """Removes all elements from the array equal to the given value. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... dfn.functions.array_remove_all(dfn.col("a"), dfn.lit(1)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [2] + """ return Expr(f.array_remove_all(array.expr, element.expr)) @@ -1496,12 +3387,35 @@ def list_remove_all(array: Expr, element: Expr) -> Expr: """Removes all elements from the array equal to the given value. This is an alias for :py:func:`array_remove_all`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... dfn.functions.list_remove_all(dfn.col("a"), dfn.lit(1)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [2] """ return array_remove_all(array, element) def array_repeat(element: Expr, count: Expr) -> Expr: - """Returns an array containing ``element`` ``count`` times.""" + """Returns an array containing ``element`` ``count`` times. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.array_repeat(dfn.lit(3), dfn.lit(3)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [3, 3, 3] + """ return Expr(f.array_repeat(element.expr, count.expr)) @@ -1509,12 +3423,36 @@ def list_repeat(element: Expr, count: Expr) -> Expr: """Returns an array containing ``element`` ``count`` times. This is an alias for :py:func:`array_repeat`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.list_repeat(dfn.lit(3), dfn.lit(3)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [3, 3, 3] """ return array_repeat(element, count) def array_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr: - """Replaces the first occurrence of ``from_val`` with ``to_val``.""" + """Replaces the first occurrence of ``from_val`` with ``to_val``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... dfn.functions.array_replace(dfn.col("a"), dfn.lit(1), + ... dfn.lit(9)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [9, 2, 1] + """ return Expr(f.array_replace(array.expr, from_val.expr, to_val.expr)) @@ -1522,6 +3460,18 @@ def list_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr: """Replaces the first occurrence of ``from_val`` with ``to_val``. This is an alias for :py:func:`array_replace`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... dfn.functions.list_replace(dfn.col("a"), dfn.lit(1), + ... dfn.lit(9)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [9, 2, 1] """ return array_replace(array, from_val, to_val) @@ -1531,6 +3481,18 @@ def array_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr) -> Exp Replaces the first ``max`` occurrences of the specified element with another specified element. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1, 1]]}) + >>> result = df.select( + ... dfn.functions.array_replace_n(dfn.col("a"), dfn.lit(1), dfn.lit(9), + ... dfn.lit(2)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [9, 2, 9, 1] """ return Expr(f.array_replace_n(array.expr, from_val.expr, to_val.expr, max.expr)) @@ -1542,12 +3504,37 @@ def list_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr) -> Expr specified element. This is an alias for :py:func:`array_replace_n`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1, 1]]}) + >>> result = df.select( + ... dfn.functions.list_replace_n(dfn.col("a"), dfn.lit(1), dfn.lit(9), + ... dfn.lit(2)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [9, 2, 9, 1] """ return array_replace_n(array, from_val, to_val, max) def array_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr: - """Replaces all occurrences of ``from_val`` with ``to_val``.""" + """Replaces all occurrences of ``from_val`` with ``to_val``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... dfn.functions.array_replace_all(dfn.col("a"), dfn.lit(1), + ... dfn.lit(9)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [9, 2, 9] + """ return Expr(f.array_replace_all(array.expr, from_val.expr, to_val.expr)) @@ -1555,6 +3542,18 @@ def list_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr: """Replaces all occurrences of ``from_val`` with ``to_val``. This is an alias for :py:func:`array_replace_all`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... dfn.functions.list_replace_all(dfn.col("a"), dfn.lit(1), + ... dfn.lit(9)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [9, 2, 9] """ return array_replace_all(array, from_val, to_val) @@ -1566,6 +3565,15 @@ def array_sort(array: Expr, descending: bool = False, null_first: bool = False) array: The input array to sort. descending: If True, sorts in descending order. null_first: If True, nulls will be returned at the beginning of the array. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[3, 1, 2]]}) + >>> result = df.select(dfn.functions.array_sort(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [1, 2, 3] """ desc = "DESC" if descending else "ASC" nulls_first = "NULLS FIRST" if null_first else "NULLS LAST" @@ -1579,14 +3587,37 @@ def array_sort(array: Expr, descending: bool = False, null_first: bool = False) def list_sort(array: Expr, descending: bool = False, null_first: bool = False) -> Expr: - """This is an alias for :py:func:`array_sort`.""" + """This is an alias for :py:func:`array_sort`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[3, 1, 2]]}) + >>> result = df.select(dfn.functions.list_sort(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [1, 2, 3] + """ return array_sort(array, descending=descending, null_first=null_first) def array_slice( array: Expr, begin: Expr, end: Expr, stride: Expr | None = None ) -> Expr: - """Returns a slice of the array.""" + """Returns a slice of the array. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3, 4]]}) + >>> result = df.select( + ... dfn.functions.array_slice(dfn.col("a"), dfn.lit(2), + ... dfn.lit(3)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [2, 3] + """ if stride is not None: stride = stride.expr return Expr(f.array_slice(array.expr, begin.expr, end.expr, stride)) @@ -1596,12 +3627,40 @@ def list_slice(array: Expr, begin: Expr, end: Expr, stride: Expr | None = None) """Returns a slice of the array. This is an alias for :py:func:`array_slice`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3, 4]]}) + >>> result = df.select( + ... dfn.functions.list_slice(dfn.col("a"), dfn.lit(2), + ... dfn.lit(3)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [2, 3] """ return array_slice(array, begin, end, stride) def array_intersect(array1: Expr, array2: Expr) -> Expr: - """Returns the intersection of ``array1`` and ``array2``.""" + """Returns the intersection of ``array1`` and ``array2``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]}) + >>> result = df.select( + ... dfn.functions.array_intersect( + ... dfn.col("a"), dfn.col("b") + ... ).alias("result") + ... ) + >>> sorted( + ... result.collect_column("result")[0].as_py() + ... ) + [2, 3] + """ return Expr(f.array_intersect(array1.expr, array2.expr)) @@ -1609,6 +3668,21 @@ def list_intersect(array1: Expr, array2: Expr) -> Expr: """Returns an the intersection of ``array1`` and ``array2``. This is an alias for :py:func:`array_intersect`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]}) + >>> result = df.select( + ... dfn.functions.list_intersect( + ... dfn.col("a"), dfn.col("b") + ... ).alias("result") + ... ) + >>> sorted( + ... result.collect_column("result")[0].as_py() + ... ) + [2, 3] """ return array_intersect(array1, array2) @@ -1617,6 +3691,21 @@ def array_union(array1: Expr, array2: Expr) -> Expr: """Returns an array of the elements in the union of array1 and array2. Duplicate rows will not be returned. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]}) + >>> result = df.select( + ... dfn.functions.array_union( + ... dfn.col("a"), dfn.col("b") + ... ).alias("result") + ... ) + >>> sorted( + ... result.collect_column("result")[0].as_py() + ... ) + [1, 2, 3, 4] """ return Expr(f.array_union(array1.expr, array2.expr)) @@ -1627,12 +3716,39 @@ def list_union(array1: Expr, array2: Expr) -> Expr: Duplicate rows will not be returned. This is an alias for :py:func:`array_union`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]}) + >>> result = df.select( + ... dfn.functions.list_union( + ... dfn.col("a"), dfn.col("b") + ... ).alias("result") + ... ) + >>> sorted( + ... result.collect_column("result")[0].as_py() + ... ) + [1, 2, 3, 4] """ return array_union(array1, array2) def array_except(array1: Expr, array2: Expr) -> Expr: - """Returns the elements that appear in ``array1`` but not in ``array2``.""" + """Returns the elements that appear in ``array1`` but not in ``array2``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]}) + >>> result = df.select( + ... dfn.functions.array_except(dfn.col("a"), dfn.col("b")).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [1] + """ return Expr(f.array_except(array1.expr, array2.expr)) @@ -1640,6 +3756,17 @@ def list_except(array1: Expr, array2: Expr) -> Expr: """Returns the elements that appear in ``array1`` but not in the ``array2``. This is an alias for :py:func:`array_except`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]}) + >>> result = df.select( + ... dfn.functions.list_except(dfn.col("a"), dfn.col("b")).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [1] """ return array_except(array1, array2) @@ -1649,6 +3776,18 @@ def array_resize(array: Expr, size: Expr, value: Expr) -> Expr: If ``size`` is greater than the ``array`` length, the additional entries will be filled with the given ``value``. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]]}) + >>> result = df.select( + ... dfn.functions.array_resize(dfn.col("a"), dfn.lit(4), + ... dfn.lit(0)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [1, 2, 0, 0] """ return Expr(f.array_resize(array.expr, size.expr, value.expr)) @@ -1658,22 +3797,64 @@ def list_resize(array: Expr, size: Expr, value: Expr) -> Expr: If ``size`` is greater than the ``array`` length, the additional entries will be filled with the given ``value``. This is an alias for :py:func:`array_resize`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]]}) + >>> result = df.select( + ... dfn.functions.list_resize(dfn.col("a"), dfn.lit(4), + ... dfn.lit(0)).alias("result")) + >>> result = result + >>> result.collect_column("result")[0].as_py() + [1, 2, 0, 0] """ return array_resize(array, size, value) def flatten(array: Expr) -> Expr: - """Flattens an array of arrays into a single array.""" + """Flattens an array of arrays into a single array. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[[1, 2], [3, 4]]]}) + >>> result = df.select(dfn.functions.flatten(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [1, 2, 3, 4] + """ return Expr(f.flatten(array.expr)) def cardinality(array: Expr) -> Expr: - """Returns the total number of elements in the array.""" + """Returns the total number of elements in the array. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.cardinality(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + 3 + """ return Expr(f.cardinality(array.expr)) def empty(array: Expr) -> Expr: - """This is an alias for :py:func:`array_empty`.""" + """This is an alias for :py:func:`array_empty`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]]}) + >>> result = df.select(dfn.functions.empty(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + False + """ return array_empty(array) @@ -1694,6 +3875,17 @@ def approx_distinct( Args: expression: Values to check for distinct entries filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 1, 2, 3]}) + >>> result = df.aggregate( + ... [], [dfn.functions.approx_distinct(dfn.col("a")).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() >= 2 + True """ filter_raw = filter.expr if filter is not None else None @@ -1712,6 +3904,17 @@ def approx_median(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: Values to find the median for filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.approx_median(dfn.col("a")).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.approx_median(expression.expr, filter=filter_raw)) @@ -1743,6 +3946,17 @@ def approx_percentile_cont( percentile: This must be between 0.0 and 1.0, inclusive num_centroids: Max bin size for the t-digest algorithm filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0, 4.0, 5.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.approx_percentile_cont(dfn.col("a"), 0.5).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 3.0 """ sort_expr_raw = sort_or_default(sort_expression) filter_raw = filter.expr if filter is not None else None @@ -1775,6 +3989,17 @@ def approx_percentile_cont_with_weight( num_centroids: Max bin size for the t-digest algorithm filter: If provided, only compute against rows for which the filter is True + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "w": [1.0, 1.0, 1.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.approx_percentile_cont_with_weight(dfn.col("a"), + ... dfn.col("w"), 0.5).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 2.0 """ sort_expr_raw = sort_or_default(sort_expression) filter_raw = filter.expr if filter is not None else None @@ -1813,6 +4038,15 @@ def array_agg( For example:: df.aggregate([], array_agg(col("a"), order_by="b")) + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.array_agg(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + [1, 2, 3] """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None @@ -1838,6 +4072,15 @@ def avg( Args: expression: Values to combine into an array filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.avg(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.avg(expression.expr, filter=filter_raw)) @@ -1855,6 +4098,17 @@ def corr(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr: value_y: The dependent variable for correlation value_x: The independent variable for correlation filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.corr(dfn.col("a"), dfn.col("b")).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.corr(value_y.expr, value_x.expr, filter=filter_raw)) @@ -1876,6 +4130,15 @@ def count( expressions: Argument to perform bitwise calculation on distinct: If True, a single entry for each distinct value will be in the result filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.count(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None @@ -1901,6 +4164,23 @@ def covar_pop(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr: value_y: The dependent variable for covariance value_x: The independent variable for covariance filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> import builtins + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... [], + ... [dfn.functions.covar_pop( + ... dfn.col("a"), dfn.col("b") + ... ).alias("v")] + ... ) + >>> builtins.round( + ... result.collect_column("v")[0].as_py(), 4 + ... ) + 0.6667 """ filter_raw = filter.expr if filter is not None else None return Expr(f.covar_pop(value_y.expr, value_x.expr, filter=filter_raw)) @@ -1918,6 +4198,17 @@ def covar_samp(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr value_y: The dependent variable for covariance value_x: The independent variable for covariance filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.covar_samp(dfn.col("a"), dfn.col("b")).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.covar_samp(value_y.expr, value_x.expr, filter=filter_raw)) @@ -1927,6 +4218,17 @@ def covar(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr: """Computes the sample covariance. This is an alias for :py:func:`covar_samp`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.covar(dfn.col("a"), dfn.col("b")).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 1.0 """ return covar_samp(value_y, value_x, filter) @@ -1940,6 +4242,15 @@ def max(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: The value to find the maximum of filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.max(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None return Expr(f.max(expression.expr, filter=filter_raw)) @@ -1949,6 +4260,15 @@ def mean(expression: Expr, filter: Expr | None = None) -> Expr: """Returns the average (mean) value of the argument. This is an alias for :py:func:`avg`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.mean(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ return avg(expression, filter) @@ -1968,6 +4288,15 @@ def median( expression: The value to compute the median of distinct: If True, a single entry for each distinct value will be in the result filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.median(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.median(expression.expr, distinct=distinct, filter=filter_raw)) @@ -1982,6 +4311,15 @@ def min(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: The value to find the minimum of filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.min(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1 """ filter_raw = filter.expr if filter is not None else None return Expr(f.min(expression.expr, filter=filter_raw)) @@ -2001,6 +4339,15 @@ def sum( Args: expression: Values to combine into an array filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.sum(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 6 """ filter_raw = filter.expr if filter is not None else None return Expr(f.sum(expression.expr, filter=filter_raw)) @@ -2015,6 +4362,15 @@ def stddev(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: The value to find the minimum of filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]}) + >>> result = df.aggregate([], [dfn.functions.stddev(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.stddev(expression.expr, filter=filter_raw)) @@ -2029,6 +4385,15 @@ def stddev_pop(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: The value to find the minimum of filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.stddev_pop(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.stddev_pop(expression.expr, filter=filter_raw)) @@ -2038,6 +4403,15 @@ def stddev_samp(arg: Expr, filter: Expr | None = None) -> Expr: """Computes the sample standard deviation of the argument. This is an alias for :py:func:`stddev`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]}) + >>> result = df.aggregate([], [dfn.functions.stddev_samp(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ return stddev(arg, filter=filter) @@ -2046,6 +4420,15 @@ def var(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the sample variance of the argument. This is an alias for :py:func:`var_samp`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.var(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ return var_samp(expression, filter) @@ -2059,6 +4442,15 @@ def var_pop(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: The variable to compute the variance for filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0, 2.0]}) + >>> result = df.aggregate([], [dfn.functions.var_pop(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.var_pop(expression.expr, filter=filter_raw)) @@ -2073,6 +4465,15 @@ def var_samp(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: The variable to compute the variance for filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.var_samp(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.var_sample(expression.expr, filter=filter_raw)) @@ -2082,6 +4483,15 @@ def var_sample(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the sample variance of the argument. This is an alias for :py:func:`var_samp`. + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.var_sample(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ return var_samp(expression, filter) @@ -2103,6 +4513,17 @@ def regr_avgx( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_avgx(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 5.0 """ filter_raw = filter.expr if filter is not None else None @@ -2126,6 +4547,17 @@ def regr_avgy( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_avgy(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None @@ -2149,6 +4581,17 @@ def regr_count( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_count(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None @@ -2172,6 +4615,17 @@ def regr_intercept( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_intercept(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 0.0 """ filter_raw = filter.expr if filter is not None else None @@ -2195,6 +4649,17 @@ def regr_r2( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_r2(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None @@ -2218,6 +4683,17 @@ def regr_slope( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_slope(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None @@ -2241,6 +4717,17 @@ def regr_sxx( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_sxx(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None @@ -2264,6 +4751,17 @@ def regr_sxy( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_sxy(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None @@ -2287,6 +4785,17 @@ def regr_syy( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_syy(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result = result + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None @@ -2316,6 +4825,15 @@ def first_value( For example:: df.aggregate([], first_value(col("a"), order_by="ts")) + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 20, 30]}) + >>> result = df.aggregate([], [dfn.functions.first_value(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 10 """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None @@ -2353,6 +4871,15 @@ def last_value( For example:: df.aggregate([], last_value(col("a"), order_by="ts")) + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 20, 30]}) + >>> result = df.aggregate([], [dfn.functions.last_value(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 30 """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None @@ -2392,6 +4919,15 @@ def nth_value( For example:: df.aggregate([], nth_value(col("a"), 2, order_by="ts")) + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 20, 30]}) + >>> result = df.aggregate([], [dfn.functions.nth_value(dfn.col("a"), 2).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 20 """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None @@ -2418,6 +4954,15 @@ def bit_and(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: Argument to perform bitwise calculation on filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [7, 3]}) + >>> result = df.aggregate([], [dfn.functions.bit_and(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None return Expr(f.bit_and(expression.expr, filter=filter_raw)) @@ -2434,6 +4979,15 @@ def bit_or(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: Argument to perform bitwise calculation on filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2]}) + >>> result = df.aggregate([], [dfn.functions.bit_or(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None return Expr(f.bit_or(expression.expr, filter=filter_raw)) @@ -2453,6 +5007,15 @@ def bit_xor( expression: Argument to perform bitwise calculation on distinct: If True, evaluate each unique value of expression only once filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [5, 3]}) + >>> result = df.aggregate([], [dfn.functions.bit_xor(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 6 """ filter_raw = filter.expr if filter is not None else None return Expr(f.bit_xor(expression.expr, distinct=distinct, filter=filter_raw)) @@ -2470,6 +5033,15 @@ def bool_and(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: Argument to perform calculation on filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [True, True, False]}) + >>> result = df.aggregate([], [dfn.functions.bool_and(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + False """ filter_raw = filter.expr if filter is not None else None return Expr(f.bool_and(expression.expr, filter=filter_raw)) @@ -2487,6 +5059,15 @@ def bool_or(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: Argument to perform calculation on filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [False, False, True]}) + >>> result = df.aggregate([], [dfn.functions.bool_or(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + True """ filter_raw = filter.expr if filter is not None else None return Expr(f.bool_or(expression.expr, filter=filter_raw)) @@ -2532,6 +5113,17 @@ def lead( For example:: lead(col("b"), order_by="ts") + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.select( + ... dfn.col("a"), dfn.functions.lead(dfn.col("a"), shift_offset=1, + ... default_value=0, order_by="a").alias("lead")) + >>> result.sort(dfn.col("a")).collect_column("lead").to_pylist() + [2, 3, 0] """ if not isinstance(default_value, pa.Scalar) and default_value is not None: default_value = pa.scalar(default_value) @@ -2587,6 +5179,17 @@ def lag( For example:: lag(col("b"), order_by="ts") + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.select( + ... dfn.col("a"), dfn.functions.lag(dfn.col("a"), shift_offset=1, + ... default_value=0, order_by="a").alias("lag")) + >>> result.sort(dfn.col("a")).collect_column("lag").to_pylist() + [0, 1, 2] """ if not isinstance(default_value, pa.Scalar): default_value = pa.scalar(default_value) @@ -2632,6 +5235,16 @@ def row_number( For example:: row_number(order_by="points") + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 20, 30]}) + >>> result = df.select( + ... dfn.col("a"), dfn.functions.row_number(order_by="a").alias("rn")) + >>> result.sort(dfn.col("a")).collect_column("rn").to_pylist() + [1, 2, 3] """ partition_by_raw = expr_list_to_raw_expr_list(partition_by) order_by_raw = sort_list_to_raw_sort_list(order_by) @@ -2676,6 +5289,15 @@ def rank( For example:: rank(order_by="points") + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 10, 20]}) + >>> result = df.select(dfn.col("a"), dfn.functions.rank(order_by="a").alias("rnk")) + >>> result.sort(dfn.col("a")).collect_column("rnk").to_pylist() + [1, 1, 3] """ partition_by_raw = expr_list_to_raw_expr_list(partition_by) order_by_raw = sort_list_to_raw_sort_list(order_by) @@ -2715,6 +5337,16 @@ def dense_rank( For example:: dense_rank(order_by="points") + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 10, 20]}) + >>> result = df.select( + ... dfn.col("a"), dfn.functions.dense_rank(order_by="a").alias("dr")) + >>> result.sort(dfn.col("a")).collect_column("dr").to_pylist() + [1, 1, 2] """ partition_by_raw = expr_list_to_raw_expr_list(partition_by) order_by_raw = sort_list_to_raw_sort_list(order_by) @@ -2755,6 +5387,16 @@ def percent_rank( For example:: percent_rank(order_by="points") + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 20, 30]}) + >>> result = df.select( + ... dfn.col("a"), dfn.functions.percent_rank(order_by="a").alias("pr")) + >>> result.sort(dfn.col("a")).collect_column("pr").to_pylist() + [0.0, 0.5, 1.0] """ partition_by_raw = expr_list_to_raw_expr_list(partition_by) order_by_raw = sort_list_to_raw_sort_list(order_by) @@ -2795,6 +5437,23 @@ def cume_dist( For example:: cume_dist(order_by="points") + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 10, 20]}) + >>> import builtins + >>> result = df.select( + ... dfn.col("a"), + ... dfn.functions.cume_dist( + ... order_by="a" + ... ).alias("cd") + ... ) + >>> [builtins.round(x, 4) for x in + ... result.sort(dfn.col("a") + ... ).collect_column("cd").to_pylist()] + [0.6667, 0.6667, 1.0] """ partition_by_raw = expr_list_to_raw_expr_list(partition_by) order_by_raw = sort_list_to_raw_sort_list(order_by) @@ -2839,6 +5498,16 @@ def ntile( For example:: ntile(3, order_by="points") + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 20, 30, 40]}) + >>> result = df.select( + ... dfn.col("a"), dfn.functions.ntile(2, order_by="a").alias("nt")) + >>> result.sort(dfn.col("a")).collect_column("nt").to_pylist() + [1, 1, 2, 2] """ partition_by_raw = expr_list_to_raw_expr_list(partition_by) order_by_raw = sort_list_to_raw_sort_list(order_by) @@ -2877,6 +5546,17 @@ def string_agg( For example:: df.aggregate([], string_agg(col("a"), ",", order_by="b")) + + Examples: + --------- + >>> import datafusion as dfn + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["x", "y", "z"]}) + >>> result = df.aggregate( + ... [], [dfn.functions.string_agg(dfn.col("a"), ",", order_by="a").alias("s")]) + >>> result = result + >>> result.collect_column("s")[0].as_py() + 'x,y,z' """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None @@ -2889,3 +5569,9 @@ def string_agg( order_by=order_by_raw, ) ) + + +if __name__ == "__main__": + import doctest + + doctest.testmod()