/*
================================================================================
pg_whoisactive - PostgreSQL Session Monitoring Tool
================================================================================
DESCRIPTION:
A comprehensive PostgreSQL function inspired by Adam Machanic's sp_whoisactive
stored procedure used on SQL Server. This function provides detailed real-time
information about active database sessions, including:
database sessions, including:
- Current executing queries and their duration
- Session blocking and lock information
- I/O statistics and resource usage
- Connection details and client information
- Transaction status and isolation levels
- Wait states and performance metrics
CAPABILITIES:
✓ Monitor active and idle database sessions
✓ Identify blocking chains and deadlock scenarios
✓ Track query execution times and resource consumption
✓ Display I/O statistics (reads, writes, temp usage)
✓ Show connection information (host, application, user)
✓ Filter sessions by various criteria
✓ Format output for easy analysis
✓ Help system with parameter documentation
PARAMETERS:
show_own_spid (BOOLEAN, default: FALSE)
- Include the current session in results
show_system_spids (BOOLEAN, default: FALSE)
- Include system/background processes
show_sleeping_spids (INTEGER, default: 1)
- 0: Show only active sessions
- 1: Show active + idle sessions with open transactions
- 2: Show all sessions (including completely idle)
get_full_inner_text (BOOLEAN, default: FALSE)
- TRUE: Display complete SQL query text
- FALSE: Truncate long queries to 100 characters
get_plans (BOOLEAN, default: FALSE)
- Reserved for future query plan integration
get_locks (BOOLEAN, default: FALSE)
- Include detailed lock information in wait_info column
get_avg_time (BOOLEAN, default: FALSE)
- Reserved for future average execution time tracking
get_additional_info (BOOLEAN, default: FALSE)
- Reserved for additional session metadata
find_block_leaders (BOOLEAN, default: TRUE)
- Identify and report blocking relationships between sessions
delta_interval (INTEGER, default: 5)
- Reserved for future delta calculations between samples
format_output (INTEGER, default: 1)
- 1: Normal output format
- 2: Reserved for detailed format
destination_table (TEXT, default: NULL)
- Reserved for future table output functionality
help (BOOLEAN, default: FALSE)
- Display help information and parameter descriptions
OUTPUT COLUMNS:
dd_hh_mm_ss - Session duration (DD:HH:MM:SS or HH:MM:SS)
session_id - PostgreSQL process ID (PID)
sql_text - Current SQL statement being executed
sql_command - SQL command type (SELECT, INSERT, UPDATE, etc.)
login_name - Database username
wait_info - Current wait state or blocking information
tran_log_writes - Transaction information and duration
cpu_time - CPU time used (reserved for future use)
tempdb_allocations - Temporary file count
tempdb_current - Current temporary space usage
blocking_session_id - PID of session blocking this one
blocked_session_count - Number of sessions this one is blocking
reads - Read I/O statistics
writes - Write I/O statistics
context_switches - Context switch information (reserved)
physical_io - Physical I/O metrics (reserved)
query_cost - Query execution cost (reserved)
status - Current session status
isolation_level - Transaction isolation level (reserved)
open_tran_count - Number of open transactions
percent_complete - Query completion percentage (reserved)
host_name - Client hostname or IP address
database_name - Current database name
program_name - Client application name
start_time - When the current query started
login_time - When the session was established
request_id - Request identifier (uses PID)
collection_time - Timestamp when data was collected
USAGE EXAMPLES:
-- Basic monitoring (most common use case)
SELECT * FROM pg_whoisactive();
-- Show all sessions including idle ones
SELECT * FROM pg_whoisactive(show_sleeping_spids := 2);
-- Focus on blocking issues
SELECT * FROM pg_whoisactive(find_block_leaders := true, get_locks := true)
WHERE blocking_session_id IS NOT NULL OR blocked_session_count > 0;
-- Detailed analysis of active sessions
SELECT * FROM pg_whoisactive(
show_sleeping_spids := 0,
get_full_inner_text := true,
get_locks := true
);
-- Monitor specific session
SELECT * FROM pg_whoisactive() WHERE session_id = 12345;
-- Get help
SELECT * FROM pg_whoisactive(help := true);
COMPANION FUNCTIONS:
pg_blocking_tree() - Shows hierarchical blocking relationships
COMPATIBILITY:
- PostgreSQL 9.6+
- Requires pg_stat_activity and pg_locks system views
- Works with all PostgreSQL editions
AUTHOR: Database Monitoring Tool
VERSION: 1.0
CREATED: 2024
================================================================================
*/
CREATE OR REPLACE FUNCTION public.pg_whoisactive(
show_own_spid BOOLEAN DEFAULT FALSE, -- Show current session
show_system_spids BOOLEAN DEFAULT FALSE, -- Show system processes
show_sleeping_spids INTEGER DEFAULT 1, -- 0=no idle, 1=sleeping with open trans, 2=all sleeping
get_full_inner_text BOOLEAN DEFAULT FALSE, -- Show full query text (FALSE truncates to 100 chars)
get_plans BOOLEAN DEFAULT FALSE, -- Reserved: never referenced in the body yet
get_locks BOOLEAN DEFAULT FALSE, -- Show lock information
get_avg_time BOOLEAN DEFAULT FALSE, -- Reserved: never referenced in the body yet
get_additional_info BOOLEAN DEFAULT FALSE, -- Reserved: never referenced in the body yet
find_block_leaders BOOLEAN DEFAULT TRUE, -- Identify blocking chain leaders
delta_interval INTEGER DEFAULT 5, -- Reserved: never referenced in the body yet
format_output INTEGER DEFAULT 1, -- Reserved: never referenced in the body yet
destination_table TEXT DEFAULT NULL, -- Reserved: never referenced in the body yet
help BOOLEAN DEFAULT FALSE -- Show help information instead of session rows
)
RETURNS TABLE (
dd_hh_mm_ss TEXT, -- Session duration (age since backend_start, not query start)
session_id INTEGER, -- Process ID
sql_text TEXT, -- Current SQL statement
sql_command TEXT, -- Command type (SELECT, INSERT, etc.)
login_name TEXT, -- Username
wait_info TEXT, -- What the session is waiting for
tran_log_writes TEXT, -- Transaction info
cpu_time TEXT, -- CPU time (if available)
tempdb_allocations TEXT, -- Temp space usage
tempdb_current TEXT, -- Current temp usage
blocking_session_id INTEGER, -- Session that's blocking this one
blocked_session_count INTEGER, -- How many sessions this one is blocking
reads TEXT, -- Read statistics
writes TEXT, -- Write statistics
context_switches TEXT, -- Context switch info
physical_io TEXT, -- Physical I/O
query_cost TEXT, -- Query cost (if available)
status TEXT, -- Session status
isolation_level TEXT, -- Transaction isolation level
open_tran_count INTEGER, -- Number of open transactions
percent_complete TEXT, -- Progress (if available)
host_name TEXT, -- Client hostname/IP
database_name TEXT, -- Current database
program_name TEXT, -- Application name
start_time TIMESTAMP WITH TIME ZONE, -- When session started
login_time TIMESTAMP WITH TIME ZONE, -- When user logged in
request_id INTEGER, -- Request ID
collection_time TIMESTAMP WITH TIME ZONE -- When this data was collected
)
LANGUAGE plpgsql
AS $function$
-- pg_whoisactive - PostgreSQL equivalent of SQL Server's sp_whoisactive
-- A comprehensive session monitoring tool for PostgreSQL.
-- Iterates over pg_stat_activity, formats one output row per qualifying session,
-- and (optionally) decorates each row with blocking and lock information.
DECLARE
rec RECORD; -- one pg_stat_activity row (plus joined pg_stat_database counters) per loop pass
blocking_info RECORD; -- holds the blocking_pid found for the current session, if any
blocked_count INTEGER; -- number of sessions the current session is blocking
lock_info TEXT; -- aggregated "locktype:mode:state" list for the current session (get_locks only)
wait_description TEXT; -- human-readable label derived from pg_stat_activity.state
query_text TEXT; -- possibly-truncated SQL text for output
cmd_type TEXT; -- first whitespace-separated token of the query, uppercased
duration_formatted TEXT; -- DD:HH:MM:SS or HH:MM:SS session-age string
days INTEGER;
hours INTEGER;
minutes INTEGER;
seconds INTEGER;
total_seconds NUMERIC; -- session age in seconds, measured from backend_start
reads_value TEXT; -- formatted block-read statistics
writes_value TEXT; -- formatted row-write statistics
temp_files_value TEXT; -- formatted temp file count
temp_bytes_value TEXT; -- formatted temp bytes (pg_size_pretty)
BEGIN
-- Show help if requested.
-- The single help row repurposes the regular output columns to carry
-- parameter descriptions and usage examples; no session data is returned.
IF help THEN
RETURN QUERY
SELECT
'HELP'::TEXT as dd_hh_mm_ss,
0 as session_id,
'pg_whoisactive - PostgreSQL equivalent of sp_whoisactive'::TEXT as sql_text,
'HELP'::TEXT as sql_command,
'show_own_spid: Include current session (default: false)'::TEXT as login_name,
'show_system_spids: Include system processes (default: false)'::TEXT as wait_info,
'show_sleeping_spids: 0=no idle, 1=idle with trans, 2=all idle (default: 1)'::TEXT as tran_log_writes,
'get_full_inner_text: Show complete SQL text (default: false)'::TEXT as cpu_time,
'get_locks: Include lock information (default: false)'::TEXT as tempdb_allocations,
'find_block_leaders: Identify blocking leaders (default: true)'::TEXT as tempdb_current,
NULL::INTEGER as blocking_session_id,
NULL::INTEGER as blocked_session_count,
'Usage: SELECT * FROM pg_whoisactive();'::TEXT as reads,
'Advanced: SELECT * FROM pg_whoisactive(show_sleeping_spids := 2, get_locks := true);'::TEXT as writes,
NULL::TEXT as context_switches,
NULL::TEXT as physical_io,
NULL::TEXT as query_cost,
NULL::TEXT as status,
NULL::TEXT as isolation_level,
NULL::INTEGER as open_tran_count,
NULL::TEXT as percent_complete,
NULL::TEXT as host_name,
NULL::TEXT as database_name,
NULL::TEXT as program_name,
NULL::TIMESTAMP WITH TIME ZONE as start_time,
NULL::TIMESTAMP WITH TIME ZONE as login_time,
NULL::INTEGER as request_id,
now() as collection_time;
RETURN;
END IF;
-- Main query logic: fetch one row per qualifying session, active sessions first.
FOR rec IN
SELECT
a.pid,
a.usename,
a.application_name,
a.client_addr,
a.client_hostname,
a.backend_start,
a.xact_start,
a.query_start,
a.state_change,
a.state,
a.query,
a.datname,
a.backend_type,
-- NOTE(review): pg_stat_database holds *per-database cumulative* counters,
-- not per-session figures. Every session connected to the same database
-- therefore reports identical blks_read/tup_*/temp_* values below — confirm
-- this is the intended semantics for the reads/writes/tempdb columns.
COALESCE(s.blks_read, 0) as blks_read,
COALESCE(s.blks_hit, 0) as blks_hit,
COALESCE(s.tup_returned, 0) as tup_returned,
COALESCE(s.tup_fetched, 0) as tup_fetched,
COALESCE(s.tup_inserted, 0) as tup_inserted,
COALESCE(s.tup_updated, 0) as tup_updated,
COALESCE(s.tup_deleted, 0) as tup_deleted,
COALESCE(s.temp_files, 0) as temp_files,
COALESCE(s.temp_bytes, 0) as temp_bytes
FROM pg_stat_activity a
LEFT JOIN pg_stat_database s ON s.datname = a.datname
WHERE
-- Filter current session unless explicitly requested
(show_own_spid = TRUE OR a.pid <> pg_backend_pid())
-- Filter system processes (walwriter, autovacuum, etc. have NULL datname
-- or a backend_type other than 'client backend')
AND (show_system_spids = TRUE OR (a.datname IS NOT NULL AND a.backend_type = 'client backend'))
-- Filter idle connections based on show_sleeping_spids parameter
AND (
show_sleeping_spids = 2 OR -- Show all
(show_sleeping_spids = 1 AND (a.state <> 'idle' OR a.xact_start IS NOT NULL)) OR -- Show active or idle in transaction
(show_sleeping_spids = 0 AND a.state = 'active') -- Show only active
)
ORDER BY
CASE WHEN a.state = 'active' THEN 1 ELSE 2 END, -- Active sessions first
a.query_start DESC NULLS LAST
LOOP
-- Calculate and format session duration.
-- Duration is measured from backend_start (session age), NOT query_start,
-- which matches the documented meaning of dd_hh_mm_ss.
total_seconds := EXTRACT(EPOCH FROM (COALESCE(now() - rec.backend_start, INTERVAL '0')));
days := FLOOR(total_seconds / 86400);
hours := FLOOR((total_seconds % 86400) / 3600);
minutes := FLOOR((total_seconds % 3600) / 60);
seconds := FLOOR(total_seconds % 60);
IF days > 0 THEN
duration_formatted := LPAD(days::TEXT, 2, '0') || ':' || LPAD(hours::TEXT, 2, '0') || ':' || LPAD(minutes::TEXT, 2, '0') || ':' || LPAD(seconds::TEXT, 2, '0');
ELSE
duration_formatted := LPAD(hours::TEXT, 2, '0') || ':' || LPAD(minutes::TEXT, 2, '0') || ':' || LPAD(seconds::TEXT, 2, '0');
END IF;
-- Process SQL text: truncate to 100 chars (97 + '...') unless full text requested
IF get_full_inner_text THEN
query_text := rec.query;
ELSE
query_text := CASE
WHEN rec.query IS NULL THEN NULL
WHEN LENGTH(rec.query) > 100 THEN LEFT(rec.query, 97) || '...'
ELSE rec.query
END;
END IF;
-- Extract command type: just the first space-separated token, uppercased.
-- NOTE(review): TRIM strips spaces only, not newlines/tabs, and queries that
-- begin with a comment or WITH will report that token instead of the verb —
-- confirm this rough heuristic is acceptable.
IF rec.query IS NOT NULL AND TRIM(rec.query) <> '' THEN
cmd_type := UPPER(SPLIT_PART(TRIM(rec.query), ' ', 1));
IF cmd_type = '' THEN cmd_type := NULL; END IF;
ELSE
cmd_type := NULL;
END IF;
-- Determine wait info and status: map pg_stat_activity.state to
-- sp_whoisactive-style labels, passing unknown states through uppercased.
wait_description := CASE
WHEN rec.state = 'active' THEN 'RUNNABLE'
WHEN rec.state = 'idle' THEN 'IDLE'
WHEN rec.state = 'idle in transaction' THEN 'IDLE IN TRANSACTION'
WHEN rec.state = 'idle in transaction (aborted)' THEN 'IDLE IN TRANSACTION (ABORTED)'
WHEN rec.state = 'fastpath function call' THEN 'FASTPATH FUNCTION CALL'
WHEN rec.state = 'disabled' THEN 'DISABLED'
ELSE UPPER(rec.state)
END;
-- Find blocking information if requested
IF find_block_leaders THEN
-- Check if this session is blocked.
-- NOTE(review): without STRICT, SELECT INTO keeps an arbitrary row when
-- several sessions block this one, and sets the record to NULL when none
-- do — confirm single-blocker reporting is acceptable.
SELECT b.blocking_pid INTO blocking_info
FROM (
SELECT
blocked_locks.pid as blocked_pid,
blocking_locks.pid as blocking_pid
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_locks blocking_locks ON (
-- Match a waiting (ungranted) lock row to a granted lock row on the
-- same lockable object; IS NOT DISTINCT FROM treats NULL = NULL as a match.
blocking_locks.locktype = blocked_locks.locktype
AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
AND blocking_locks.virtualtransaction IS NOT DISTINCT FROM blocked_locks.virtualtransaction
)
WHERE NOT blocked_locks.granted
AND blocking_locks.granted
AND blocked_locks.pid <> blocking_locks.pid
) b
WHERE b.blocked_pid = rec.pid;
-- Count how many sessions this one is blocking (same lock-match join,
-- filtered the other way around).
SELECT COUNT(*) INTO blocked_count
FROM (
SELECT
blocked_locks.pid as blocked_pid,
blocking_locks.pid as blocking_pid
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_locks blocking_locks ON (
blocking_locks.locktype = blocked_locks.locktype
AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
AND blocking_locks.virtualtransaction IS NOT DISTINCT FROM blocked_locks.virtualtransaction
)
WHERE NOT blocked_locks.granted
AND blocking_locks.granted
AND blocked_locks.pid <> blocking_locks.pid
) b
WHERE b.blocking_pid = rec.pid;
ELSE
-- NOTE(review): blocking_info is set to plain NULL here, yet
-- blocking_info.blocking_pid is still referenced in the RETURN QUERY below.
-- Verify on the target PG version that field access on a null record yields
-- NULL rather than raising "record is not assigned yet".
blocking_info := NULL;
blocked_count := 0;
END IF;
-- Get lock information if requested: one "locktype:mode:GRANTED|WAITING"
-- entry per pg_locks row held or awaited by this session.
IF get_locks THEN
SELECT STRING_AGG(
l.locktype || ':' || l.mode ||
CASE WHEN l.granted THEN ':GRANTED' ELSE ':WAITING' END,
', '
) INTO lock_info
FROM pg_locks l
WHERE l.pid = rec.pid;
ELSE
lock_info := NULL;
END IF;
-- Format read/write statistics (database-wide counters; see NOTE above)
reads_value := CASE
WHEN rec.blks_read > 0 OR rec.blks_hit > 0 THEN
'Logical: ' || (rec.blks_read + rec.blks_hit)::TEXT || ', Physical: ' || rec.blks_read::TEXT
ELSE NULL
END;
writes_value := CASE
WHEN (rec.tup_inserted + rec.tup_updated + rec.tup_deleted) > 0 THEN
'Rows: ' || (rec.tup_inserted + rec.tup_updated + rec.tup_deleted)::TEXT
ELSE NULL
END;
-- Format temp usage (also database-wide cumulative counters)
temp_files_value := CASE
WHEN rec.temp_files > 0 THEN rec.temp_files::TEXT || ' files'
ELSE NULL
END;
temp_bytes_value := CASE
WHEN rec.temp_bytes > 0 THEN pg_size_pretty(rec.temp_bytes)
ELSE NULL
END;
-- Return the row. Column order must match the RETURNS TABLE declaration;
-- the trailing comment on each line names the output column it feeds.
RETURN QUERY
SELECT
duration_formatted, -- dd_hh_mm_ss
rec.pid, -- session_id
query_text, -- sql_text
cmd_type, -- sql_command
rec.usename::TEXT, -- login_name
CASE
WHEN blocking_info.blocking_pid IS NOT NULL THEN
'BLOCKED BY: ' || blocking_info.blocking_pid::TEXT
WHEN get_locks AND lock_info IS NOT NULL THEN lock_info
ELSE wait_description
END, -- wait_info: blocker wins, then lock list, then state label
CASE
WHEN rec.xact_start IS NOT NULL THEN
'Open: ' || EXTRACT(EPOCH FROM (now() - rec.xact_start))::INTEGER::TEXT || 's'
ELSE NULL
END, -- tran_log_writes: open-transaction age in seconds
NULL::TEXT, -- cpu_time (not available in PostgreSQL)
temp_files_value, -- tempdb_allocations
temp_bytes_value, -- tempdb_current
blocking_info.blocking_pid, -- blocking_session_id
CASE WHEN blocked_count > 0 THEN blocked_count ELSE NULL END, -- blocked_session_count
reads_value, -- reads
writes_value, -- writes
NULL::TEXT, -- context_switches (not available)
NULL::TEXT, -- physical_io
NULL::TEXT, -- query_cost (would need EXPLAIN)
wait_description, -- status (same label as the wait_info fallback)
NULL::TEXT, -- isolation_level (would need transaction info)
CASE WHEN rec.xact_start IS NOT NULL THEN 1 ELSE 0 END, -- open_tran_count: 0/1 flag, not a true count
NULL::TEXT, -- percent_complete (not available)
COALESCE(rec.client_hostname, rec.client_addr::TEXT, 'local'), -- host_name
rec.datname::TEXT, -- database_name
COALESCE(rec.application_name::TEXT, 'unknown'), -- program_name
rec.query_start, -- start_time
rec.backend_start, -- login_time
rec.pid, -- request_id (using pid)
now(); -- collection_time
END LOOP;
END;
$function$;
-- Create a helper function for quick blocking analysis
CREATE OR REPLACE FUNCTION pg_blocking_tree()
RETURNS TABLE (
    level INTEGER,
    blocking_pid INTEGER,
    blocked_pid INTEGER,
    blocked_user TEXT,
    blocked_query TEXT,
    blocked_duration TEXT
)
LANGUAGE sql
AS $$
/*
================================================================================
pg_blocking_tree - Hierarchical Blocking Chain Analysis
================================================================================
DESCRIPTION:
Displays blocking relationships in a hierarchical tree format, showing
which sessions are blocking others and the chain of dependencies.
FIX: the previous version declared WITH RECURSIVE but had no recursive term,
so every row reported level 1 and chains deeper than one hop were shown as
independent pairs. The CTE below seeds at root blockers (sessions that block
others but are not blocked themselves) and then walks the blocking graph,
incrementing level per hop. Edges are de-duplicated (one waiter/blocker pair
can match on several pg_locks rows) and recursion is capped to guard against
pathological cycles in the lock graph.
RETURNS:
level            - Depth in the blocking hierarchy (1 = blocked directly by a root)
blocking_pid     - Session causing the block
blocked_pid      - Session being blocked
blocked_user     - Username of blocked session
blocked_query    - SQL being blocked
blocked_duration - Seconds since the blocked session's current query started
USAGE:
SELECT * FROM pg_blocking_tree();
================================================================================
*/
WITH RECURSIVE lock_pairs AS (
    -- Distinct (blocker, blocked) edges: match each waiting (ungranted) lock
    -- row to a granted lock row on the same lockable object.
    SELECT DISTINCT
        blocking_locks.pid AS blocking_pid,
        blocked_locks.pid  AS blocked_pid
    FROM pg_catalog.pg_locks blocked_locks
    JOIN pg_catalog.pg_locks blocking_locks ON (
        blocking_locks.locktype = blocked_locks.locktype
        AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
        AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
        AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
        AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
        AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
        AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
        AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
        AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
        AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
        AND blocking_locks.virtualtransaction IS NOT DISTINCT FROM blocked_locks.virtualtransaction
    )
    WHERE NOT blocked_locks.granted
      AND blocking_locks.granted
      AND blocked_locks.pid <> blocking_locks.pid
),
blocking_tree AS (
    -- Roots: blockers that are not themselves waiting on anyone.
    -- NOT EXISTS (rather than NOT IN) avoids the NULL trap: pg_locks.pid is
    -- NULL for locks held by prepared transactions.
    SELECT
        1 AS level,
        p.blocking_pid,
        p.blocked_pid
    FROM lock_pairs p
    WHERE NOT EXISTS (
        SELECT 1 FROM lock_pairs q WHERE q.blocked_pid = p.blocking_pid
    )
    UNION ALL
    -- Walk downward: a session blocked by an already-placed blocked session
    -- sits one level deeper in the tree.
    SELECT
        t.level + 1,
        p.blocking_pid,
        p.blocked_pid
    FROM lock_pairs p
    JOIN blocking_tree t ON p.blocking_pid = t.blocked_pid
    WHERE t.level < 32 -- safety cap: lock graphs can contain cycles
)
SELECT
    t.level,
    t.blocking_pid,
    t.blocked_pid,
    a.usename::TEXT AS blocked_user,
    a.query AS blocked_query,
    EXTRACT(EPOCH FROM (now() - a.query_start))::INTEGER::TEXT || 's' AS blocked_duration
FROM blocking_tree t
JOIN pg_catalog.pg_stat_activity a ON a.pid = t.blocked_pid
ORDER BY t.level, t.blocking_pid, t.blocked_pid;
$$;
/*
================================================================================
USAGE EXAMPLES AND REFERENCE
================================================================================
-- BASIC MONITORING
--------------------------------------------------------------------------------
-- View all active sessions (most common usage)
SELECT * FROM pg_whoisactive();
-- Show only currently executing queries
SELECT * FROM pg_whoisactive(show_sleeping_spids := 0);
-- Include all sessions (even completely idle ones)
SELECT * FROM pg_whoisactive(show_sleeping_spids := 2);
-- BLOCKING ANALYSIS
--------------------------------------------------------------------------------
-- Quick check for blocking issues
SELECT * FROM pg_whoisactive(find_block_leaders := true)
WHERE blocking_session_id IS NOT NULL OR blocked_session_count > 0;
-- Detailed blocking analysis with lock information
SELECT session_id, login_name, sql_text, wait_info, blocking_session_id, blocked_session_count
FROM pg_whoisactive(get_locks := true, find_block_leaders := true)
WHERE blocking_session_id IS NOT NULL OR blocked_session_count > 0;
-- Show complete blocking hierarchy
SELECT * FROM pg_blocking_tree();
-- DETAILED SESSION ANALYSIS
--------------------------------------------------------------------------------
-- Full query text and detailed information
SELECT * FROM pg_whoisactive(
get_full_inner_text := true,
get_locks := true,
find_block_leaders := true
);
-- Monitor specific session by PID
SELECT * FROM pg_whoisactive() WHERE session_id = 12345;
-- Long-running transactions (note: dd_hh_mm_ss is TEXT and mixes DD:HH:MM:SS
-- with HH:MM:SS, so sorting it lexicographically is unreliable — sort on a
-- timestamp column instead)
SELECT session_id, login_name, sql_command, tran_log_writes, dd_hh_mm_ss
FROM pg_whoisactive()
WHERE tran_log_writes IS NOT NULL
ORDER BY start_time ASC;
-- PERFORMANCE MONITORING
--------------------------------------------------------------------------------
-- Sessions with high I/O activity
SELECT session_id, login_name, sql_command, reads, writes, tempdb_current
FROM pg_whoisactive()
WHERE reads IS NOT NULL OR writes IS NOT NULL OR tempdb_current IS NOT NULL;
-- Currently active sessions, oldest session first (sorting the textual
-- dd_hh_mm_ss column would compare strings, not durations, so order by the
-- login_time timestamp instead)
SELECT session_id, dd_hh_mm_ss, login_name, sql_text, status
FROM pg_whoisactive(show_sleeping_spids := 0)
ORDER BY login_time ASC;
-- TROUBLESHOOTING QUERIES
--------------------------------------------------------------------------------
-- Find sessions from specific application
SELECT * FROM pg_whoisactive() WHERE program_name LIKE '%myapp%';
-- Find sessions from specific user
SELECT * FROM pg_whoisactive() WHERE login_name = 'problem_user';
-- Find sessions connected from specific host
SELECT * FROM pg_whoisactive() WHERE host_name LIKE '%problemhost%';
-- Sessions in specific database
SELECT * FROM pg_whoisactive() WHERE database_name = 'production_db';
-- ADMINISTRATIVE TASKS
--------------------------------------------------------------------------------
-- Help and parameter reference
SELECT * FROM pg_whoisactive(help := true);
-- Include system processes for maintenance operations
SELECT * FROM pg_whoisactive(show_system_spids := true);
-- Monitor your own session activity
SELECT * FROM pg_whoisactive(show_own_spid := true) WHERE session_id = pg_backend_pid();
================================================================================
*/
Tag: database
How do you check if logical replication is running on a postgres instance?
- Check if logical replication is enabled in the PostgreSQL configuration: open the PostgreSQL configuration file (postgresql.conf), usually located in the PostgreSQL data directory, and look for the wal_level parameter. Logical replication requires wal_level to be set to logical (the lower replica setting is sufficient for physical replication, but not for logical decoding). You can find the location of the PostgreSQL data directory by querying the pg_settings system view: SELECT name, setting FROM pg_settings WHERE name = 'data_directory';
- Check if logical replication slots are being used: logical replication uses replication slots to keep track of the status of replication. You can check whether any replication slots are in use by querying the pg_replication_slots system view: SELECT slot_name, plugin, slot_type, database FROM pg_replication_slots; If there are slots with slot_type of logical, then logical replication is being used.
- Check if there are any active subscriptions: logical replication involves publishers and subscribers. Publishers publish changes to the replication stream, and subscribers consume those changes. To see whether there are any active subscriptions, query the pg_subscription system catalog table: SELECT * FROM pg_subscription; If there are subscriptions listed here, then logical replication is in use.
- Check if there are any replication connections: replication involves walsender connections from the publisher to its subscribers. You can check for active replication connections by querying the pg_stat_replication system view: SELECT * FROM pg_stat_replication; If there are entries listed here, then replication connections (physical or logical) are active.
By following these steps, you can determine if logical replication is running on your PostgreSQL instance.
UTF8 and MySQL
(2012-05-12 update: Please replace all mentions of “utf8” with “utf8mb4” to get a full utf8 character set. utf8_general_ci is just a half-assed implementation of utf8. Similarly, replace all representations of utf8_xxxxxx_xx collations with their utf8mb4 variant. In for a penny, in for a pound. NO ONE wants to do two utf8 migrations, ever. The methodology remains the same.)
If ever there was a TL;DR, this is it. However, it is important to understand the “whys” in my world, so here goes.
A full discussion of character sets and encoding is beyond the scope of this document. (If you want more background, I recommend checking out the wikipedia article for a good place to start here: http://en.wikipedia.org/wiki/Character_encoding ) However, a basic understanding of character set encoding technology and the specific character sets covered in this document is going to be required in order for the problem description and subsequent solution to make sense.
US-ASCII and Latin-1 Character Sets
In the early days of computing the ASCII character set became the industry standard that most computer systems used for representing characters. It’s a 7-bit encoding scheme which defines all the most commonly used characters in English text, as well as some non-printable control characters.
Latin-1 (ISO-8859-1) is a slight extension of US-ASCII in that it is 8-bit (ie. each character is exactly one byte or octet), and defines some characters used by non-English languages which nevertheless share much of the character set with English. (This includes many European languages which employ accented characters and the like.) Latin-1 also defines several other commonly-used symbols and glyphs. Latin-1 is one of several 8-bit extensions to ASCII, and the one most commonly in use as a default by US computers.
The major drawback of using Latin-1 is that it has only 256 possible characters, and therefore has no way of effectively representing characters in languages whose alphabets differ significantly from English (this includes languages like Russian, Greek, Hebrew, Arabic, almost all Asian languages and just about every language ever invented by humans). However, Latin-1 and US-ASCII remain the most commonly used character sets for all computer-related functions. That is, except for web pages.
UTF-8 Character Set
In 2008, UTF-8 surpassed ASCII as the most commonly used character encoding in use on the World Wide Web (source: http://en.wikipedia.org/wiki/ASCII ). Adoption of this character encoding scheme seems to be a growing trend for most applications in the information technology industry. A likely reason for this is because it is possible to represent the alphabet of nearly every written language ever invented using just this character set– which really helps when designing software that is meant to have an international user-base. (The thought here is that your non-English-speaking users will prefer to represent words in their native languages using characters from their native alphabets.)
UTF-8 is a variable length multi-byte character encoding scheme in that each character may be represented using 1, 2, 3 or 4 bytes or octets of data. It can encode all of Unicode's roughly 1.1 million possible code points, of which over one hundred thousand characters have been assigned so far. It can be used to encode any Unicode character, including the characters in nearly every language known to humankind, as well as a bunch of symbols and other glyphs that aren’t letters at all.
The main advantage of using UTF-8 over other multi-byte character encoding schemes (which can also encode Unicode characters) is that UTF-8 is backwards-compatible with ASCII. That is, a valid series of ASCII bytes is also a valid set of UTF-8 bytes and represents the exact same characters.
One obvious advantage to using UTF-8 instead of Latin-1 or ASCII (especially for web pages, where international appeal is usually desired) is that once you’ve converted over to using UTF-8, you really don’t have to worry about character set encoding issues again: For just about any human language that has an alphabet or character set, there will be a valid UTF-8 encoding of those characters. (That’s not to say there aren’t other caveats and implications about which you need to be aware when using UTF-8, but we’ll get to those a little later on.) Internationalization of your software becomes much less of a problem when using UTF-8.
For more good background information on UTF-8, check out its wikipedia article.
MySQL and Character Sets
MySQL by default uses the Latin-1 character set for all entities within the software or its data sets. In recent years, using UTF-8 has become a viable (and recommended) alternative, but probably in order not to confuse people with changing behavior, the developers of MySQL have decided to leave the default at Latin-1 for now. I believe this will change with MySQL version 6 (which seems like a good time to make everyone go through the task of converting from Latin-1 to UTF-8).
MySQL has the ability to convert characters between Latin-1 and UTF-8 and will, in fact, do this transparently in many cases. As it turns out, this can actually be a problem which can lead to the exact sort of character set hell this document tries to help the reader recover from.
In terms of the way MySQL stores textual fields (like CHARs, VARCHARs, and TEXT columns)– as far as I’ve been able to determine, at the raw storage level there is no difference between one of these fields and a BLOB (which is just raw binary data), except that text fields have a “collation” applied to them. Documentation on what this means is somewhat lacking, but as far as I can tell, “collation” in this case doesn’t exactly mean the character set in use in the text fields in the table so much as how MySQL will attempt to interpret the binary data stored in there. This means that it is possible to get ourselves into interesting situations where MySQL will store UTF-8 encoded characters in a table with Latin-1 collation. Also, since not every sequence of bytes is a valid string of UTF-8 characters, it’s possible to have invalid characters in a UTF-8 collated table.
Through experimentation I’ve been able to determine that if you have a Latin-1 collated table with UTF-8 encoded characters in it, then attempting to take the most direct approach to fix this and simply alter the collation from Latin-1 to UTF-8 on the table causes MySQL to “helpfully” convert the characters in the text fields in the table from Latin-1 to UTF-8 for you. The conversion algorithm, unfortunately, is not intelligent enough to figure out when there already are perfectly valid UTF-8 characters in the text fields and instead tries to convert each byte of a multi-byte character into its UTF-8 equivalent. This results in what I will call “double-encoded UTF-8 characters” in your text fields, and looks like a bunch of gobbledygook. Do this several times (usually in an attempt to fix the problem you’re actually just making worse) and you can have triple- and quadruple-encoded characters (which look like really long strings of gobbledygook which nevertheless have noticeable patterns in them). Fortunately this is all reversible.
Oh– and if this weren’t enough, MySQL can have different character encoding settings for the server, client, table, field, query results, replication, etc. What’s more, MySQL will transparently convert characters for you between character sets depending on how they’re used and accessed. (This can make troubleshooting these character set problems “interesting” to say the least.) Further, the terminal software or web server or web browser you’re using to access the data might also be using its own character set (and transparently converting characters for you)…
Why Should You Care?
You may be asking yourself, if you’re an English-speaking person writing English software which will only be used by an English-speaking user-base, why you should care about using the UTF-8 character encoding scheme instead of US-ASCII or Latin-1. It certainly seems like there are enough places where it’s easy to screw something up that it ought to be better to avoid the problem entirely and stick with Latin-1. There are several reasons you want to deal with this problem now:
- Again, the trend is that more and more software (especially that which touches the internet or has an international user-base) is being written using UTF-8 as the default character set. If you try to stay in the Latin-1 world, you will probably find that over the next couple of years the tools you use will be upgraded to using UTF-8 beneath you.
- It’s a bad idea to assume that English speaking users won’t try to enter UTF-8 characters into the text fields in your Latin-1 collated tables. (Ever see those “smart quotes?” Those are actually Unicode characters!) If this happens, especially if you don’t do character set validation yourself, you can end up in a situation where you do have UTF-8 encoded characters in your Latin-1 collated table (which means you’ve just landed in MySQL character set hell.)
- Having UTF-8 characters in your Latin-1 encoded tables means you can potentially create mysqldumps of your data which are not restore-able. This can happen transparently– mysqldump will create the dump with no errors or warnings, but you won’t know that the dump has syntax errors in it until you attempt to do a restore. (And this is usually exactly the time you don’t want to find out that your backups have serious problems.)
- The unfortunate nature of many of the problems associated with MySQL character set hell is that they can start happening with no indication that anything is wrong or that data is getting munged– until months or years later when you need to preserve the data you have, yet it’s a complete mess because of character encoding issues that have worked their way in there somehow.
Believe me: Even if you believe you’ll never need anything more than the 256 characters in the Latin-1 character set, it’s better to do it with UTF-8 now and save yourself the headache months or years from now.
The Problem
It’s actually somewhat difficult for me to define exactly what it is that constitutes MySQL character set hell. I’ve alluded to some of the problems that make it a nasty situation to be in above, but just to try to create a more complete list of the symptoms describing what I mean here:
- Some characters (specifically those not in the ASCII character set) show up as being a series of 2-4 characters of gobbledygook. This might be in the mysql command line client, in your application, or in any other tool accessing text fields in your database.
- Special accented characters are showing up on your web page or in your database as the “replacement character” (�) or question marks. Sometimes forcing the browser to treat the page as being encoded in UTF-8 will show you the characters as they’re supposed to be.
- A mysqldump you have made of your database is not restore-able. (The mysql command-line client complains about syntax errors in the dump.)
- You’ve recently updated one of the components of your application, and either legacy data in the database isn’t showing up right anymore, or new data you add isn’t showing up right anymore.
- You’ve just converted from Rails 1.2 to Rails 2.0 or later and suddenly those special characters are showing signs of corruption as has been detailed in this document.
Further, while the above can be fiendish problems the following secondary factors you may encounter when troubleshooting them can make it a truly hellish position to be in (and get out of):
- Using two different terminal programs, or connecting to the same database in different ways, the mysql client shows you what look like different results for the exact same query on the exact same data (where in one the data looks right, and in the other, the data looks corrupt– but only for non-ASCII characters).
- MySQL seems to resist your attempts to correct the double-encoding issue: That is, trying to fix the problem by altering a given table’s collation just seems to make the problem worse no matter what you do.
- If you have some characters that are double-encoded in your UTF-8 collated table, and some characters that aren’t, attempting to fix the double-encoded characters destroys the characters that were already right to begin with.
- If you have a very large data set, fixing the problem by manually editing a mysqldump, or by correcting each corrupt character individually is a losing proposition, even if you can find hexadecimal patterns to use in REGEX matching / replacement statements in SQL.
- The above point is especially true if you’ve got a lot of Chinese or Japanese characters (where there can be tens of thousands of possible characters that could be double-encoded).
- Most of the documentation I’ve been able to find on this seems to involve a very large amount of manually checking and manually correcting errors found in the data set. This again seems like a losing proposition.
- The MySQL documentation alludes to the fact that you don’t want to be in a position where you have two types of character encoding in the same table (see http://dev.mysql.com/doc/refman/5.1/en/charset-conversion.html ), but it does not suggest any course of action to correct this situation should it actually happen to you. Rather, it menacingly says such tables with columns like that “cannot be converted properly.”
- MySQL will “helpfully” transparently alter the character set being used in any given situation based on many different (and often hard to discover) settings. This can make it difficult, to say the least, to find out when and where character encoding conversions are happening, or what component in the system needs its configuration changed.
- MySQL uses its own process of character set conversion (more than likely in an attempt to be able to do conversions on invalid source data without aborting or disallowing the conversion entirely). Note especially that it does not work the same way as the standard gnu iconv utility.
- Attempts to fix double-encoded characters with iconv probably won’t work (depending on the data set and characters contained therein). Iconv appears to follow the RFC-compliant behavior of exiting with an error at the first improperly-encoded series of bytes it encounters instead of continuing with a warning. (And it will more than likely see these kinds of series of bytes, especially in a large, diverse and somewhat old dataset.) This is not what mysqld does. Furthermore, experimentation shows it doesn’t appear to catch and correct all instances of mysql double-encoding (among those data it actually will process).
… and the list goes on. The above are just some of the things I found attempting to resolve character encoding issues for one customer on one moderately-sized data set. I have no doubt there are more symptoms and stumbling blocks that can be encountered when dealing with MySQL Character Set Hell.
The Solution
If you’ve read this far, you’ve probably already gathered that the only quasi-permanent solution to the problem of MySQL Character Set Hell is to use the UTF-8 (and only the UTF-8) character encoding everywhere: It’s the direction all content on the internet is headed, and universal adoption on PCs and server systems probably won’t be too far behind. The rest of this document proceeds with the assumption that this is where you want to be at the end of the process. The procedure for getting there consists of a few steps:
- Gather your tools.
- Make everything everywhere speak UTF-8.
- Convert your Latin-1 collated tables to UTF-8.
- Clean up double- (and triple- and quadruple-) -encoded characters in your current data set.
- Fix any other residual problems in your data set left over from previous attempts at cleaning up the character set encoding issues.
Step 1: Gather your tools
The way I fixed the above issues was done mostly using standard linux terminal software and the mysql command-line client. Specifically, I used gnome-terminal under Fedora linux, however any terminal software which understands how to display the UTF-8 character set should work. If you have to tweak any of your terminal’s settings to get it to talk UTF-8 by default, you should probably do this. (The current version of gnome-terminal with Fedora speaks UTF-8 by default, so I didn’t have to do anything special here.)
Beyond that, it would be a good idea to consult the documentation for your application software (or the interpreter it uses, be that php, perl, rails, etc.) to determine how you force it to use UTF-8 as its default character set for all things MySQL related. (Since Blue Box Group specializes in Ruby on Rails, I’ll talk specifically about how to do this with Rails, but for other application platforms you’ll need to discover how to do this on your own.)
Note that these instructions do not cover using any kind of GUI to fix your MySQL character set issues: A GUI introduces another layer of indirection from the database, which can further complicate the troubleshooting process (and may introduce yet another place a character set conversion may take place). The goal here is to get MySQL talking UTF-8 at the most fundamental level and in my mind the most direct way to do this is with the mysql command-line client.
Also, these instructions assume you have complete administrative control over your own mysql database. If you run into MySQL Character Set Hell in a shared environment where you don’t have complete control over your databases, you’ll probably have to work with whoever does have administrative control of your database to get these problems fixed. Obviously, YMMV here.
Step 2: Make everything everywhere speak UTF-8
To configure both your mysql server daemon and client to speak UTF-8 by default, add lines like the following to every section of your my.cnf and restart your mysql daemon. If you use a mysql client from a different machine, make sure your client machine also has a my.cnf file which defines at least the [client] section and has the following configuration line:
default-character-set = utf8
You can verify both server and client are speaking UTF-8 by running the following query on the mysql command line (which should yield the following results):
mysql> show variables like 'char%';
+--------------------------+----------------------------+
| Variable_name | Value |
+--------------------------+----------------------------+
| character_set_client | utf8 |
| character_set_connection | utf8 |
| character_set_database | utf8 |
| character_set_filesystem | binary |
| character_set_results | utf8 |
| character_set_server | utf8 |
| character_set_system | utf8 |
| character_sets_dir | /usr/share/mysql/charsets/ |
+--------------------------+----------------------------+
8 rows in set (0.00 sec)
If you see latin1 mentioned in there anywhere, double-check your configuration, and / or make sure you’ve restarted your mysql daemon.
If you’re running Rails version 2.0 or later, this version of Rails uses the UTF-8 character encoding scheme by default for talking to the MySQL database. If you’re using Rails 1.2, you need to add the last line below to the appropriate sections of your database.yml file in order to get your application to speak UTF-8 to the database:
production:
adapter: mysql
database: mydatabase
username: mydbuser
password: mydbpassword
host: mydbhost
encoding: utf8
If you are using a different application platform, you’ll need to check its documentation to determine how to get your mysql connection to speak UTF-8. If you’ve written your own functions to do MySQL queries, you can often make sure these queries use UTF-8 by issuing the following query after opening each MySQL connection:
set names utf8;
And of course, to test your application to make sure that it’s really talking UTF-8 to the database, you can always execute and examine the results of the show variables like ‘char%’; query above.
Step 3: Convert your Latin-1 collated tables to UTF-8
Step 3A: Make a backup
Since this step is the first place we’re actually going to be changing data within the database, the very first thing we’ll want to do here is make a backup of the database. Note that since in MySQL Character Set Hell, a mysqldump of the database is likely to have syntax errors in it, I recommend making a backup of the raw database files. In order for this backup to have clean data in it, you’ll need to shut down your mysql daemon while making the backup. If your server is a RedHat- or CentOS-like operating system using vendor-supplied RPMs for the MySQL packages (and default paths in your my.cnf), the following set of commands ought to do this for you:
# service mysql stop
# cd /var/lib
# tar zcvf mysql-backup.tar.gz mysql
# service mysql start
You’ll want to make sure your database server has enough space under /var/lib on it to store the above backup, of course.
Step 3B: Convert those tables
Please note that if your data set does not contain a mix of single- and double-encoded UTF-8 characters in a Latin-1 character set (ie. you either have one or the other) then there are safer conversion techniques than what I’m about to document below. (Specifically, see the MySQL documentation on character set conversion referenced earlier.) This step assumes that your data set contains tables with a mix of single- and double- (and triple- and quadruple-) encoded UTF-8 characters in a single Latin-1 collated table.
The procedure for converting your tables to UTF-8 collation will vary depending on which sources you’ll read, but the way that worked for me was to do a carefully-crafted mysqldump, alter the dump slightly, then re-import the dump back over the database itself. To minimize the chances of clobbering data in tables that are already properly UTF-8 collated, this really only needs to be done on tables using the Latin-1 collation to begin with. To discover which tables these are, you can run:
mysql> show table status where collation like '%latin1%' \G
*************************** 1. row ***************************
Name: UserInfo
Engine: InnoDB
Version: 10
Row_format: Compact
Rows: 2
Avg_row_length: 8192
Data_length: 16384
Max_data_length: 0
Index_length: 65536
Data_free: 0
Auto_increment: 3
Create_time: 2008-06-01 22:48:11
Update_time: NULL
Check_time: NULL
Collation: latin1_swedish_ci
Checksum: NULL
Create_options:
Comment: InnoDB free: 0 kB
9 rows in set (0.00 sec)
mysql>
(Yes, “latin1_swedish_ci” is the name MySQL uses for Latin-1 collation.) You’ll need to run the above for all databases on the MySQL server, and compile a list of affected tables. For each of these tables, run:
# mysqldump -u dbuser -pdbpass --opt --skip-set-charset \
--default-character-set=latin1 --skip-extended-insert \
databasename --tables tablename > database.table.sql
# perl -i -pe 's/DEFAULT CHARSET=latin1/DEFAULT CHARSET=utf8/' \
database.table.sql
# cat database.table.sql | mysql -u dbuser -pdbpass databasename
Let me analyze what each of the above three commands does for you:
- On the mysqldump command, the –skip-set-charset and –default-character-set=latin1 options should prevent MySQL from taking the already-Latin-1-collated table and “helpfully” converting it to any other character set for you. This should ensure that your mysqldump is really in the Latin-1 character encoding scheme. The –skip-extended-insert option forces mysqldump to put each INSERT command in the dump on its own line. This will make the dump take much longer to re-import, however, in my experimentation, adding this option was enough to prevent the dump from having syntax errors in it anywhere. And in any case, should the re-import fail for any reason, having each row’s data on its own line really helps to be able to zero-in on which rows are causing you problems (and gives you easier options to work-around the problem rows).
- The perl command above takes your mysqldump and replaces the first instance of “DEFAULT CHARSET=latin1” and replaces it with “DEFAULT CHARSET=utf8”. Since your dump should only contain one table, the first instance of this string should occur within the CREATE TABLE definition toward the top of the dump. Doing this change using a perl script both avoids any problems you might introduce by opening the dump in an editor (like, say, if it “helpfully” decides to wrap lines for you), and allows you to make this change on a dump that’s multiple-gigabytes in size.
- The third command re-imports the modified mysqldump back into the database. The new table should have the UTF-8 collation.
An astute observer here will note that we told mysql above to import a UTF-8 encoded dump that was actually encoded using Latin-1. In my experimentation, I found that this actually does seem to work, and I suspect this is because of the following reasons:
- If most of your data is in the ASCII character set (as it probably is), then the Latin-1 and UTF-8 representations of these series of bytes is actually binary equivalent. This means your rows that are purely ASCII will be unaffected by the above.
- For those rows with characters that are not ASCII in them, it appears the MySQL developers had already thought of the above conversion problems, and therefore wrote the mysql client to take a “best guess” approach to importing improperly-formatted data. Specifically, if in the above mysqldump, the mysql client gets handed rows with invalid UTF-8 characters in them, it appears mysql will assume that the characters are actually Latin-1 encoded and convert them to UTF-8 on the fly. Thank you MySQL developers! This is one place where that “helpful” automatic conversion really does help. Note that if you have any Latin-1 characters between the hexadecimal bytes 0xC2 and 0xF4 which are also followed by the right sequence of extended bytes thereafter… well, this can actually look like a properly-encoded UTF-8 character, so the mysql client’s best guess algorithm here is unlikely to know these are actually Latin-1 characters and will instead interpret them as the UTF-8 character those bytes appear to be. However, in practice, any such sequence of Latin-1 characters looks like gobbledygook anyway, so is unlikely to occur in a “real” data set. In the tens of gigabytes of data I converted for our new customer (which contained characters from at least a dozen languages including a lot of Chinese and Japanese) this never occurred, and it appears nothing got munged in the process. (We don’t really have a good way to detect munged characters in this case, but about 3 weeks after the conversion, nobody has reported any brokenness in the converted data.)
I should also point out here that other sources of documentation will recommend actually converting the mysqldump to the UTF-8 character set using the iconv utility. If you have a relatively clean dataset, this may work for you. For the data set I had to work with, it did not.
Anyway, before going any further in these instructions, it’s a good idea to check the collation on your converted tables:
mysql> show table status \G
*************************** 1. row ***************************
Name: UserInfo
Engine: InnoDB
Version: 10
Row_format: Compact
Rows: 2
Avg_row_length: 8192
Data_length: 16384
Max_data_length: 0
Index_length: 65536
Data_free: 0
Auto_increment: 3
Create_time: 2008-06-01 22:48:11
Update_time: NULL
Check_time: NULL
Collation: utf8_general_ci
Checksum: NULL
Create_options:
Comment: InnoDB free: 0 kB
9 rows in set (0.00 sec)
mysql>
Step 4: Clean up double-encoded characters in your current data set
Before going any further in these instructions, I highly recommend taking a close look at your application. At this point, everything everywhere should be speaking UTF-8, and all your tables should have the UTF-8 collation. In an ideal world, especially if you started with a relatively clean data set, this may be all you need to do to make everything peachy in your application. So take a moment to look at your application (and specifically content generated from rows with non-ASCII characters in them) and verify you really need to go further in this how-to.
Ok, so from here on out, we’re going to assume that you do still have problems displaying characters as they should be in your application, and that this is due to having double-encoded characters in your UTF-8 collated tables. (Note that if your terminal and mysql command-line client are speaking UTF-8, you can verify the problems lie with the data itself with a few SELECT statements targeted at rows containing the data.)
Caveats
As a few final notes before moving on just to drive the point home:
- *The remainder of these instructions will not work if you are not speaking UTF-8 everywhere, or if your tables are not using the UTF-8 collation.*
- If you have a relatively small number of rows affected by character encoding issues at this point, it may just be simpler and safer to fix these by hand.
- If you have a relatively small number of different types of double-encoded UTF-8 characters it may be simpler to convert these characters directly using techniques detailed by Derek Sivers in his excellent blog article on the same here: http://www.oreillynet.com/lpt/wlg/9022
- These instructions assume you won’t have single- and double-encoded UTF-8 characters in the same field of a single row of any given table. This is a relatively safe assumption, unless you’ve already partially-completed manually fixing the data as described by Derek Sivers in the above point. If you do have such fields in your data set, these instructions will definitely corrupt them. (ie. You’re better off either finishing your conversion using Derek’s techniques, or reverting the data back to the state they were in before attempting Derek’s techniques.)
Scope of these instructions
The instructions in this section of this document will detail how to fix double- (and triple- and quadruple-)-encoded character issues for one column in all rows of one table in the database. These steps will need to be repeated for any affected columns in all tables in your database. Scripting this is left as an exercise for the reader (or as an incentive to become a customer of Blue Box Group.)
For the examples I’ll be using below, we’ll use the following example name conventions. Obviously, you’ll need to replace these with the actual names that apply to your database:
- Table: mytable
- Field / Column: myfield
- Type of the above field / column: mytype
- Temporary table (name is arbitrarily chosen): temptable
Step 4A: Find tables with fields containing multi-byte characters in them
At this point, only those rows which have non-ASCII characters in text-containing fields (like CHAR, VARCHAR, TEXT, etc.) should have problems with double-encoded UTF-8 characters in them. The really nice part about UTF-8 here (and tables containing UTF-8 characters) is that any non-ASCII characters are by design multi-byte characters. We can exploit this fact to find tables and fields with rows containing non-ASCII characters.
mysql> select count(*) from mytable where LENGTH(myfield) !=
CHAR_LENGTH(myfield);
+----------+
| count(*) |
+----------+
| 669 |
+----------+
1 row in set (0.28 sec)
If the above query returns a count of zero, this means that either:
- There are no non-ASCII characters in any rows of the table for this field. Therefore, this field of this table does not need any correction and can be ignored.
- Your table is not using UTF-8 collation (which is bad, because if you’ve been following my instructions thus far it should be).
- Non-ASCII characters in this field in this table have already been munged by previous attempts at correcting character encoding issues and have been turned into single-byte characters (more than likely a question mark). If you have any rows with fields like this, the information needed to automatically convert them back into the multi-byte characters they should be has already been lost, and these rows will need to be fixed by hand.
Anyway, you will want to go through all text-like fields in all tables of your database and find any which contain rows that have multi-byte characters in them like this. These rows can potentially have double-encoded UTF-8 characters in them and will need conversion as detailed below. I suggest compiling these into a list you can automatically parse with a script which will do the remainder of the steps below.
*Also note that it is very important at this stage to make note of the field type and any associated parameters (eg. the length of the VARCHAR, etc.).* I suggest taking a close look at the table’s description to get this information.
Step 4B: Copy rows with multi-byte characters into a temporary table
Now that we know which fields in which tables potentially have double-encoded UTF-8 characters in them, copy just those rows into a temporary table. The goal here is to minimize the chances of blowing away large chunks of the data you want to keep, so we only need to run the remainder of our conversion steps on rows that are potential candidates for double-encoded UTF-8 characters:
mysql> create table temptable (select * from mytable where
LENGTH(myfield) != CHAR_LENGTH(myfield));
Query OK, 669 rows affected (1.15 sec)
Records: 669 Duplicates: 0 Warnings: 0
Step 4C: Convert double-encoded UTF-8 characters to proper UTF-8 characters
In order to do this, we’re actually going to convert the field from UTF-8 to Latin-1 and use a round-about way of changing it back to UTF-8 again. This may seem counter-intuitive, but I’ll explain why this works after listing the actual commands to run:
mysql> alter table temptable modify temptable.myfield mytype character
set latin1;
Query OK, 669 rows affected, 127 warnings (0.05 sec)
Records: 669 Duplicates: 0 Warnings: 0
mysql> alter table temptable modify temptable.myfield blob;
Query OK, 669 rows affected (0.17 sec)
Records: 669 Duplicates: 0 Warnings: 0
mysql> alter table temptable modify temptable.myfield mytype character
set utf8;
Query OK, 669 rows affected, 552 warnings (0.05 sec)
Records: 669 Duplicates: 0 Warnings: 0
Let me explain what the above does:
- The first alter table command tells mysql to convert the UTF-8 encoded characters into Latin-1. At first, this appears to be the exact opposite of what we were trying to accomplish above by making everything everywhere speak UTF-8. But please remember that by “double-encoded” UTF-8 characters, I really mean properly-encoded UTF-8 characters that were run through MySQL’s Latin-1 to UTF-8 conversion algorithm one too many times. By converting the UTF-8 collated field to Latin-1, this effectively tells MySQL to run those characters through the reverse algorithm: Convert UTF-8 encoded characters to Latin-1. This has the effect of taking those double-encoded UTF-8 characters and turning them into single-encoded or properly-encoded UTF-8 characters. So what about those rows which already had properly-encoded UTF-8 characters in them? Well, that’s what some of those warnings were. But we’ll get to that in the next sub-step.
- The second alter table command just converts the textual field into a BLOB (or raw binary) field. This does no change to the actual binary data underneath and no conversion algorithm is run.
- The third alter table command takes that BLOB field and converts it back to a textual field with the UTF-8 encoding. Since MySQL has no algorithms defined for character conversion when going from a BLOB to any other data type, these last two steps effectively allow us to fool MySQL into converting the field from Latin-1 to UTF-8 collation, but skip the character set conversion algorithm that would otherwise re-double-encode our now pristine properly-encoded UTF-8 characters. Pretty sneaky, eh.
Step 4D: Remove rows with only single-byte characters from the temporary table
Ok, so about those rows that already had properly-encoded UTF-8 characters in them: If MySQL encounters a multi-byte character in UTF-8 encoding, it essentially has two options when converting this character to Latin-1:
- If a Latin-1 representation of that character exists (as is the case with characters like: ¶, Ü, ¿, £, etc.), then MySQL just converts the multi-byte UTF-8 character into its single-byte Latin-1 equivalent.
- If a Latin-1 representation of that character does not exist, MySQL converts the multi-byte UTF-8 character into a question mark (?), which is the closest thing Latin-1 has to the replacement character (�). This also generates the warnings alluded to above, because we are effectively removing our ability to properly convert this character back from Latin-1 to UTF-8, and MySQL knows this.
There’s also another case I haven’t discussed yet which you might encounter here: If the series of bytes that should be the UTF-8 field contains any sequence of bytes that does not actually represent UTF-8 characters (ie. corrupt characters or starting data that was never Latin-1 or UTF-8 at all to begin with), then my experimentation seems to show that MySQL will truncate these fields at the first such sequence of bytes. This appears to be how it will also handle queries on tables with UTF-8 collation where the series of bytes actually stored in the fields do not describe a proper UTF-8 string. Anyway, the end result of this “corrupt starting data” is that in the end, MySQL will truncate it at the first erroneous byte sequence whenever it’s accessed.
And this is another key behavioral feature of MySQL’s character handling algorithms we can exploit: At the end of step 4C above, any fields which had already contained only properly-encoded UTF-8 characters in them, or which contained an erroneous sequence of bytes will be converted to strings which contain only single-byte characters.
Since these are exactly the fields we do not want to do any kind of conversion on, it’s easy enough now to exclude them from our list of rows in the temporary table to be fixed in the “real” table:
mysql> delete from temptable where LENGTH(myfield) = CHAR_LENGTH(myfield);
Query OK, 549 rows affected (0.01 sec)
Step 4E: Re-insert fixed rows back into the original table
Since our temporary table should now include only those rows which had double-encoded UTF-8 characters in them, and since those rows have been “fixed” as it were, all we need to do now is to update said rows in the original table. Note that it would probably be a good idea to take a look at the “fixed” rows with a few select statements before you do this:
mysql> replace into mytable (select * from temptable);
Query OK, 120 rows affected (0.05 sec)
Records: 120 Duplicates: 0 Warnings: 0
Step 4F: Verify fixed rows are actually fixed
At this point, the field we just updated should contain only valid, properly-encoded UTF-8 characters in it (unless, of course, you had triple- and quadruple-encoded characters). Especially before you run a script to do steps 4B through 4E above for all your affected columns in all your affected tables, now would be a very good time to check how those affected characters from those fields look in your application, in selects on the mysql command line client, etc. to verify the fixes we did above actually did fix things.
Step 5: Fix any residual problems
It’s entirely possible at this point that there still might be some character set issues which have defied our above attempts at using MySQL’s character encoding conversion algorithms to fix. There are any number of ways this may have happened, especially if you have a rather old data set that has been used with several different application front-ends or has been moved across several different platforms or been touched by a lot of different developers who may have taken different approaches to dealing with the problem of MySQL Character Set Hell. The hope here is that by following the above steps we should have reduced the number of rows which need fixing to only a handful, where manually fixing them is no longer a losing proposition.
If you find yourself at this stage, I recommend reading through Derek Sivers’ article on doing this as one way to fix these lingering problems: http://www.oreillynet.com/lpt/wlg/9022
Some final notes on the conversion process
- If you have triple- or quadruple-encoded UTF-8 characters, to fix these, just run the data through steps 4B through 4E above multiple times. The conversion process above is designed to avoid munging properly-encoded UTF-8 characters and should be safe to run multiple times on the same data set.
- If you have any foreign key constraints, triggers, etc. which would make the above steps as literally detailed impossible, then you’ll need to alter your plan somewhat. Hopefully at this point you understand enough of how the process works in order to come up with a plan to work around such constraints.
- If you’ve read through all these instructions hoping to fix MySQL character set problems involving any other character sets than Latin-1 or UTF-8, I’m sorry to say these instructions may not help you: I’ve not tried the above with any other character sets, and have no idea whether MySQL’s behavioral quirks (which I’m exploiting to do this fix) work as well for other character encoding schemes as they do for UTF-8 and Latin-1.
- As usual, your mileage may vary. This document was written in the hope that it would save some DBAs or SysAdmins somewhere a lot of trouble and headache by documenting in detail what worked for us and why. No promises, guarantees, or warranties are implied in any way shape or form with this document and we take no responsibility for any kind of data, customer, hair, employment, or spousal corruption or loss as a result of following the advice contained herein.
A few final words on using UTF-8
Let’s see… I said at some point above that if you’re new to the UTF-8 world, there are other issues about which you need to be aware when using this character set encoding scheme. Here are a few major ones:
- In the early stages of UTF-8, there were a myriad of exploits and buffer-overruns found in dealing with UTF-8 encoded character sets. These largely came from the (at the time) widespread assumption that 1 byte = 1 character. While these problems have largely been worked through in your operating system and standard software suites, if you’re writing your own code which will be exposed to UTF-8 characters, you need to keep in mind that characters are not guaranteed to be 1 byte long each, and plan accordingly.
- Some characters in UTF-8 take up no space or even print to the right of the cursor when displayed on screen. This is by design as some languages’ alphabets are not meant to be read from left to right and have special ways of dealing with inflections, diphthongs, glottal stops, and other things you’d need to consult a linguist to understand. However, this can be somewhat surprising at first if you’re used to the English paradigm of everything you type appearing on the screen from left to right (especially if your terminal actually does know the right way to display these characters).
- If you have profanity filters or other character- or string-recognition-based anti-abuse measures in use on your site, UTF-8 presents some new challenges. Because of the vast number of characters in the UTF-8 character encoding scheme, it’s possible to take a character in another language which looks close enough to an English character to effectively side-step such filters. UTF-8 also opens the door for other less severe abuses of the character set like this upside-down text: ¡ƃuıuǝddɐɥ s,ʇɐɥʍ ɐǝpı ou ǝʌɐɥ ı This becomes even more of a problem if an abusive user decides to put abusive content on your site in a language you don’t speak (and therefore probably won’t realize is abusive). If you only really want to support English characters in your application, instead of avoiding the use of UTF-8, I would instead recommend still following the above instructions to make everything everywhere speak UTF-8, but run user input through a filter which allows only a subset of the UTF-8 character set to be used. This way you can continue to have working profanity, etc. filters while avoiding the problem of MySQL Character Set Hell.
- If you rely on user reports to detect such abuses in your application, if you allow your users (and abusers) to enter any characters they would like for their username or other identifying information, you may find that your legitimate users have a hard time reporting the abuses since they won’t know how to type this identifying information in an abuse report. (In other words, when it comes to user identification, it’s usually best to write your code to limit this to just characters that can be found in the ASCII character set.)
…and this list also goes on. Despite this, the more universal adoption of UTF-8 as the “standard” character set for nearly all human alphabets is probably a good thing. (And yes, your application is going to have to deal with these 21st-century problems sometime — it’s better to deal with these sooner rather than later, eh.)
A conclusion (of sorts)
The one thing you’ll probably see in all the documentation you’ll find online about dealing with MySQL character set encoding issues is that it’s better to start with everything set up right from day one, and not have to go through a painful conversion. As they say, hindsight is better than foresight, and that definitely applies to the problems associated with MySQL character set encoding.
If the steps I’ve done my best to explain above still seem too difficult to follow, then the only thing I can say about this is that this is a situation where hiring an expert DBA, or a well-trained, experienced support team is money well spent. This is a deceptively difficult position to be in, and it’s a good idea to have people who know what they’re doing to help you get out of it.
In any case, if you’ve read this far, I thank you and wish you the best of luck in climbing out of MySQL Character Set Hell.
Customizing the MySQL CLI Prompt
Let us face it. The following mysql> prompt is boring. Nobody wants to see it. Let us change the default mysql> prompt to something functional and useful.
$ mysql -u root -pyour-password
Welcome to the MySQL monitor. Commands end with ; or \g.
Your MySQL connection id is 10
Server version: 5.1.25-rc-community MySQL Community Server (GPL)
mysql>
1. Display username, hostname and current database name in the mysql prompt
The MYSQL_PS1 in this example displays the following three information in the prompt:
- \u – Username
- \h – Hostname
- \d – Current mysql database
export MYSQL_PS1="\u@\h [\d]> "
$ mysql -u root -pyour-password -D sugarcrm
root@dev-db [sugarcrm]>
Now the mysql prompt doesn’t look that bad, does it? This prompt is more meaningful than the useless default “mysql>” prompt.
2. Change the mysql> prompt interactively
You can also change the mysql> prompt interactively from inside the mysql as shown below.
$ mysql -u root -pyour-password -D sugarcrm
mysql> prompt \u@\h [\d]>
PROMPT set to '\u@\h [\d]> '
root@dev-db [sugarcrm]>
3. Change the mysql> prompt from mysql command line
Instead of using the MYSQL_PS1 variable, you can also pass the prompt as an argument to the mysql command line as shown below.
$ mysql --prompt="\u@\h [\d]> " -u root -pyour-password -D sugarcrm
root@dev-db [sugarcrm]>
4. Display Current Time in the mysql> prompt
Use \D to display full date in the mysql prompt as shown below.
$ export MYSQL_PS1="\u@\h [\D]> "
$ mysql -u root -pyour-password -D sugarcrm
root@dev-db [Sat Dec 26 19:56:33 2009]>
5. Change the mysql> prompt using /etc/my.cnf or .my.cnf file
You can also use either the global /etc/my.cnf (or) your local ~/.my.cnf file to set the prompt as shown below.
$ vi ~/.my.cnf
[mysql]
prompt=\\u@\\h [\\d]>\\_
$ mysql -u root -pyour-password -D sugarcrm
root@dev-db [sugarcrm]>
6. Customize mysql> prompt any way you want it
Use the following variables and customize the mysql prompt as you see fit. These variables are somewhat similar to the Unix PS1 variables (but not exactly the same).
Generic variables:
- \S displays semicolon
- \' displays single quote
- \" displays double quote
- \v displays server version
- \p displays port
- \\ displays backslash
- \n displays newline
- \t displays tab
- \ displays space (there is a space after \ )
- \d displays default database
- \h displays default host
- \_ displays space (there is an underscore after \ )
- \c displays a mysql statement counter; it keeps increasing as you type commands.
- \u displays username
- \U displays username@hostname accountname
Date related variables:
- \D displays full current date (as shown in the above example)
- \w displays 3 letter day of the week (e.g. Mon)
- \y displays the two digit year
- \Y displays the four digit year
- \o displays month in number
- \O displays 3 letter month (e.g. Jan)
- \R displays current time in 24 HR format
- \r displays current time in 12 hour format
- \m displays the minutes
- \s displays the seconds
- \P displays AM or PM
Note: You can go back to the regular boring mysql> prompt at anytime by simply typing prompt in the mysql> prompt as shown below.
root@dev-db [sugarcrm]> prompt
Returning to default PROMPT of mysql>
mysql>