Skip to content

Common data cleaning and transforming tasks in SQL

https://www.datacamp.com/tutorial/cleaning-data-sql#!

https://towardsdatascience.com/cleaning-and-transforming-data-with-sql-f93c4de0d2fc

filtering out NULL


-- There are null values in column b
-- We want to remove those rows as they are invalid data
-- Query produces an output with NULL values from column be removed.

SELECT
    column_a,
    column_b,
    column_c,
FROM table_1
WHERE column_b IS NOT NULL

Changing column data types


-- Sometimes columns are not in an appropriate type.
-- For example, a numeric column might be string type within the database
--

SELECT
    column_a,
    CAST(column_b AS NUMERIC) AS column_b,
    column_c
FROM table_1

Changing column names


-- Sometimes columns are not named appropriately
-- Using `AS` when selecting columns can be used to rename
-- columns in the output results

SELECT
    column_a AS the_first_column
    column_b AS a_better_name_for_the_second_column,
    column_c AS clear_and_understandable_name
FROM table_1

Replacing strings

Identifying duplicate rows


SELECT COUNT(*)
FROM
    (SELECT
        column_a,
        column_b,
        column_c,
        COUNT(*) AS records
    FROM table_1
    GROUP BY
        column_a,
        column_b,
        column_c) a 
WHERE records > 1;


-- rewritten as CTE

WITH duplicate_count AS (
    SELECT
        column_a,
        column_b,
        column_c,
        COUNT(*) AS records
    FROM table_1
    GROUP BY
        column_a,
        column_b,
        column_c
)

-- count rows where duplicates have been found
SELECT COUNT(*)
FROM duplicate_count
WHERE records > 1;