MDEV-39227: Schema Versioning - Server-side#5127
Conversation
- SQL layer infrastructure for multiple versions of a table definition. - No storage engine changes; any DDLs that depends on the engine or .frm re-write for persistance is not yet supported. - binlog/replication, DDL log for crash recovery are not supported yet.
There was a problem hiding this comment.
Code Review
This pull request introduces multi-version support for the Table Definition Cache (TDC) to enable lock-free metadata-only ALTER TABLE operations (specifically ALTER COMMENT). It achieves this by introducing a TABLE_SHARE_VERSION structure, allowing in-flight transactions to remain bound to older schema versions while new connections receive the updated version. The code review identified several critical issues: a potential memory leak where idle TDC_element structures are not returned to the unused LRU list when an older version is garbage collected, a security vulnerability where a buffer overflow could occur in production due to relying solely on a DBUG_ASSERT for key length validation, a potential crash/cleanup bug in the bound fast path when jumping to the err label without holding the expected pins, and an opportunity to optimize redundant calls to element->current().
| if (v != e->versions_tail) | ||
| { | ||
| --v->ref_count; | ||
| if (!share->is_view) | ||
| mysql_cond_broadcast(&e->COND_release); | ||
| if (v->ref_count == 0) | ||
| tdc_gc_version(e, v); | ||
| mysql_mutex_unlock(&e->LOCK_table_share); | ||
| DBUG_VOID_RETURN; | ||
| } |
There was a problem hiding this comment.
When an older version's reference count drops to 0 and it is garbage collected via tdc_gc_version(), the chain might be reduced to only holding the current version (versions_tail). If the current version's reference count is already 0, the entire TDC_element is now completely idle. However, because tdc_gc_version() does not check for this or put the element on the unused_shares LRU (and we skipped doing so when the current version's ref count originally hit 0 because the chain wasn't of length 1 yet), the element will leak in the TDC hash forever and never be purged.
To fix this, we should check if the chain has become length 1 with a ref count of 0 after GC. Since we need LOCK_unused_shares to put it on the LRU or delete it, we must temporarily increment the current version's ref count to prevent concurrent deletion, unlock LOCK_table_share, acquire the locks in the correct order, decrement the ref count, and then safely handle the idle element.
if (v != e->versions_tail)
{
--v->ref_count;
if (!share->is_view)
mysql_cond_broadcast(&e->COND_release);
if (v->ref_count == 0)
{
tdc_gc_version(e, v);
/*
If the chain now has only one version and its ref_count is 0,
the element is truly idle. We must put it on the unused_shares LRU
or delete it if flushed, otherwise it will leak in the hash forever.
Since we need LOCK_unused_shares, we temporarily pin the element
to prevent deletion, unlock LOCK_table_share, and acquire locks in
the correct order.
*/
if (e->versions_head == e->versions_tail && e->versions_tail->ref_count == 0)
{
e->versions_tail->ref_count++;
mysql_mutex_unlock(&e->LOCK_table_share);
mysql_mutex_lock(&LOCK_unused_shares);
mysql_mutex_lock(&e->LOCK_table_share);
e->versions_tail->ref_count--;
if (e->versions_head == e->versions_tail &&
e->versions_tail->ref_count == 0)
{
TABLE_SHARE_VERSION *tail= e->versions_tail;
if (tail->flushed || tdc_records() > tdc_size)
{
mysql_mutex_unlock(&LOCK_unused_shares);
tdc_delete_share_from_hash(e);
DBUG_VOID_RETURN;
}
DBUG_ASSERT(e->next == 0);
unused_shares.push_back(e);
}
mysql_mutex_unlock(&e->LOCK_table_share);
mysql_mutex_unlock(&LOCK_unused_shares);
DBUG_VOID_RETURN;
}
}
mysql_mutex_unlock(&e->LOCK_table_share);
DBUG_VOID_RETURN;
}| DBUG_ASSERT(key_length <= sizeof(b->key)); | ||
| memcpy(b->key, key, key_length); |
There was a problem hiding this comment.
Relying solely on DBUG_ASSERT to protect against buffer overflow is unsafe because assertions are compiled out in release builds. If key_length ever exceeds sizeof(b->key) in a production environment, memcpy will cause a buffer overflow. Adding a runtime check ensures robustness and security in all build configurations.
if (key_length > sizeof(b->key))
{
my_free(b);
return true;
}
memcpy(b->key, key, key_length);| bool any_in_use= false; | ||
| if (element->current()) | ||
| { | ||
| for (TABLE_SHARE_VERSION *v= element->versions_head; v; v= v->next) | ||
| { | ||
| if (v->ref_count > 0) | ||
| { | ||
| any_in_use= true; | ||
| break; | ||
| } | ||
| } | ||
| } | ||
| if (any_in_use && !element->current()->share->is_view) | ||
| { | ||
| DBUG_ASSERT(element->share); | ||
| DBUG_ASSERT(element->current()->share); | ||
| bool do_flush= 0; | ||
| switch (arg->flush_type) { | ||
| case FLUSH_ALL: | ||
| do_flush= 1; | ||
| break; | ||
| case FLUSH_NON_TRANS_TABLES: | ||
| if (!element->share->online_backup && | ||
| element->share->table_category == TABLE_CATEGORY_USER) | ||
| if (!element->current()->share->online_backup && | ||
| element->current()->share->table_category == TABLE_CATEGORY_USER) | ||
| do_flush= 1; | ||
| break; | ||
| case FLUSH_SYS_TABLES: | ||
| if (!element->share->online_backup && | ||
| element->share->table_category != TABLE_CATEGORY_USER) | ||
| if (!element->current()->share->online_backup && | ||
| element->current()->share->table_category != TABLE_CATEGORY_USER) | ||
| do_flush= 1; | ||
| } | ||
| if (do_flush) | ||
| { | ||
| element->ref_count++; // Protect against delete | ||
| if (push_dynamic(shares, (uchar*) &element->share)) | ||
| element->current()->ref_count++; // Protect against delete | ||
| if (push_dynamic(shares, (uchar*) &element->current()->share)) | ||
| result= TRUE; | ||
| } |
There was a problem hiding this comment.
Calling element->current() multiple times (8 times in this block) is redundant and slightly inefficient. Storing the result in a local variable cur at the beginning of the block simplifies the code and improves readability and performance.
TABLE_SHARE_VERSION *cur= element->current();
bool any_in_use= false;
if (cur)
{
for (TABLE_SHARE_VERSION *v= element->versions_head; v; v= v->next)
{
if (v->ref_count > 0)
{
any_in_use= true;
break;
}
}
}
if (any_in_use && !cur->share->is_view)
{
DBUG_ASSERT(cur->share);
bool do_flush= 0;
switch (arg->flush_type) {
case FLUSH_ALL:
do_flush= 1;
break;
case FLUSH_NON_TRANS_TABLES:
if (!cur->share->online_backup &&
cur->share->table_category == TABLE_CATEGORY_USER)
do_flush= 1;
break;
case FLUSH_SYS_TABLES:
if (!cur->share->online_backup &&
cur->share->table_category != TABLE_CATEGORY_USER)
do_flush= 1;
}
if (do_flush)
{
cur->ref_count++; // Protect against delete
if (push_dynamic(shares, (uchar*) &cur->share))
result= TRUE;
}
}| mysql_mutex_lock(&element->LOCK_table_share); | ||
| if (unlikely(share->error)) | ||
| { | ||
| open_table_error(share, share->error, share->open_errno); | ||
| goto err; | ||
| } | ||
| if (share->is_view && !(flags & GTS_VIEW)) | ||
| { | ||
| open_table_error(share, OPEN_FRM_NOT_A_TABLE, ENOENT); | ||
| goto err; | ||
| } | ||
| if (!share->is_view && !(flags & GTS_TABLE)) | ||
| { | ||
| open_table_error(share, OPEN_FRM_NOT_A_VIEW, ENOENT); | ||
| goto err; | ||
| } |
There was a problem hiding this comment.
In the bound fast path, we bypass the lf_hash_search lookup and therefore do not pin the element. Jumping to err: on error is risky because err: might assume that the element was pinned and attempt to unpin it (or perform other cleanup associated with the slow path). It is much safer and cleaner to make the bound path self-contained by unlocking the mutex and returning 0 directly on error.
mysql_mutex_lock(&element->LOCK_table_share);
if (unlikely(share->error))
{
open_table_error(share, share->error, share->open_errno);
mysql_mutex_unlock(&element->LOCK_table_share);
DBUG_RETURN(0);
}
if (share->is_view && !(flags & GTS_VIEW))
{
open_table_error(share, OPEN_FRM_NOT_A_TABLE, ENOENT);
mysql_mutex_unlock(&element->LOCK_table_share);
DBUG_RETURN(0);
}
if (!share->is_view && !(flags & GTS_TABLE))
{
open_table_error(share, OPEN_FRM_NOT_A_VIEW, ENOENT);
mysql_mutex_unlock(&element->LOCK_table_share);
DBUG_RETURN(0);
}
ALTER TABLE COMMENTis made to work to validate this works.