diff --git a/sql/handler.cc b/sql/handler.cc index 7855a58c31103..8c9175855b7b6 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -7098,12 +7098,14 @@ bool ha_table_exists(THD *thd, const LEX_CSTRING *db, { if (!hton) hton= &dummy; - *hton= element->share->db_type(); - *is_sequence= element->share->table_type == TABLE_TYPE_SEQUENCE; - if (*hton != view_pseudo_hton && element->share->tabledef_version.length && + *hton= element->current()->share->db_type(); + *is_sequence= element->current()->share->table_type == TABLE_TYPE_SEQUENCE; + if (*hton != view_pseudo_hton && + element->current()->share->tabledef_version.length && table_id && (table_id->str= (uchar*) - thd->memdup(element->share->tabledef_version.str, MY_UUID_SIZE))) + thd->memdup(element->current()->share->tabledef_version.str, + MY_UUID_SIZE))) table_id->length= MY_UUID_SIZE; tdc_unlock_share(element); DBUG_RETURN(TRUE); diff --git a/sql/mdl.cc b/sql/mdl.cc index 2330357f8ad2b..7ab0e6b0b96db 100644 --- a/sql/mdl.cc +++ b/sql/mdl.cc @@ -3719,6 +3719,14 @@ void MDL_context::release_transactional_locks(THD *thd) DBUG_ASSERT(!(thd->server_status & (SERVER_STATUS_IN_TRANS | SERVER_STATUS_IN_TRANS_READONLY))); release_locks_stored_before(MDL_STATEMENT, NULL); + /* + Release schema bindings before MDL_TRANSACTION tickets. Each binding + holds a ref_count pin on its TABLE_SHARE_VERSION; dropping the pin + here may trigger GC of OLDER versions. Keeping the MDL_TRANSACTION + ticket held during this step prevents concurrent X-locking DDL from + racing with the share's tdc->LOCK_table_share manipulations. 
+ */ + thd->release_transaction_schema_bindings(); release_locks_stored_before(MDL_TRANSACTION, NULL); DBUG_VOID_RETURN; } diff --git a/sql/sql_base.cc b/sql/sql_base.cc index 95abfa798bf65..41ee053136236 100644 --- a/sql/sql_base.cc +++ b/sql/sql_base.cc @@ -289,11 +289,14 @@ static my_bool list_open_tables_callback(void *el, void *a) (*arg->start_list)->in_use= 0; mysql_mutex_lock(&element->LOCK_table_share); - All_share_tables_list::Iterator it(element->all_tables); TABLE *table; - while ((table= it++)) - if (table->in_use) - ++(*arg->start_list)->in_use; + for (TABLE_SHARE_VERSION *v= element->versions_head; v; v= v->next) + { + All_share_tables_list::Iterator it(v->all_tables); + while ((table= it++)) + if (table->in_use) + ++(*arg->start_list)->in_use; + } mysql_mutex_unlock(&element->LOCK_table_share); (*arg->start_list)->locked= 0; /* Obsolete. */ arg->start_list= &(*arg->start_list)->next; @@ -476,28 +479,51 @@ static my_bool tc_collect_used_shares(void *el, void *a) DYNAMIC_ARRAY *shares= &arg->shares; mysql_mutex_lock(&element->LOCK_table_share); - if (element->ref_count > 0 && !element->share->is_view) + /* + "Table is in use" per FLUSH TABLES semantics = any version of this + name has ref_count > 0. After a lock-free DDL the chain may briefly + hold an OLDER version pinned by an in-flight transaction together + with a CURRENT version that no one has touched yet — we still need + to treat the table as in use. + + The do_flush properties (online_backup, table_category) and the + share we push for downstream processing are taken from current(), + which is what FLUSH TABLES will ultimately drain. 
+ */ + bool any_in_use= false; + if (element->current()) + { + for (TABLE_SHARE_VERSION *v= element->versions_head; v; v= v->next) + { + if (v->ref_count > 0) + { + any_in_use= true; + break; + } + } + } + if (any_in_use && !element->current()->share->is_view) { - DBUG_ASSERT(element->share); + DBUG_ASSERT(element->current()->share); bool do_flush= 0; switch (arg->flush_type) { case FLUSH_ALL: do_flush= 1; break; case FLUSH_NON_TRANS_TABLES: - if (!element->share->online_backup && - element->share->table_category == TABLE_CATEGORY_USER) + if (!element->current()->share->online_backup && + element->current()->share->table_category == TABLE_CATEGORY_USER) do_flush= 1; break; case FLUSH_SYS_TABLES: - if (!element->share->online_backup && - element->share->table_category != TABLE_CATEGORY_USER) + if (!element->current()->share->online_backup && + element->current()->share->table_category != TABLE_CATEGORY_USER) do_flush= 1; } if (do_flush) { - element->ref_count++; // Protect against delete - if (push_dynamic(shares, (uchar*) &element->share)) + element->current()->ref_count++; // Protect against delete + if (push_dynamic(shares, (uchar*) &element->current()->share)) result= TRUE; } } @@ -600,7 +626,7 @@ bool flush_tables(THD *thd, flush_tables_type flag) { TABLE_SHARE *share= *dynamic_element(&collect_arg.shares, i, TABLE_SHARE**); - TABLE *table= tc_acquire_table(thd, share->tdc); + TABLE *table= tc_acquire_table(thd, share->version); /* Pinned version */ if (table) { (void) table->file->extra(HA_EXTRA_FLUSH); @@ -745,7 +771,7 @@ close_all_tables_for_name(THD *thd, TABLE_SHARE *share, TABLE *skip_table) { DBUG_ASSERT(!share->tmp_table); - DBUG_ASSERT(share->tdc->flushed); + DBUG_ASSERT(share->version->flushed); char key[MAX_DBKEY_LENGTH]; size_t key_length= share->table_cache_key.length; @@ -2245,7 +2271,7 @@ bool open_table(THD *thd, TABLE_LIST *table_list, Open_table_context *ot_ctx) if (!(flags & MYSQL_OPEN_IGNORE_FLUSH)) #endif { - if (share->tdc->flushed) + if
(share->version->flushed) { /* We already have an MDL lock. But we have encountered an old @@ -2277,7 +2303,7 @@ bool open_table(THD *thd, TABLE_LIST *table_list, Open_table_context *ot_ctx) goto retry_share; } - if (thd->open_tables && thd->open_tables->s->tdc->flushed) + if (thd->open_tables && thd->open_tables->s->version->flushed) { /* If the version changes while we're opening the tables, diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 3856a1d4ed736..9f0c844fee980 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -863,6 +863,7 @@ THD::THD(my_thread_id id, bool is_wsrep_applier) is_slave_error= FALSE; my_hash_clear(&handler_tables_hash); my_hash_clear(&ull_hash); + my_hash_clear(&transaction_schema_bindings); tmp_table=0; cuted_fields= 0L; limit_found_rows= 0; @@ -1753,6 +1754,8 @@ void THD::cleanup(void) my_hash_free(&user_vars); my_hash_free(&sequences); + if (my_hash_inited(&transaction_schema_bindings)) + my_hash_free(&transaction_schema_bindings); sp_caches_clear(); statement_rcontext_reinit(); auto_inc_intervals_forced.empty(); @@ -6642,6 +6645,125 @@ void THD::leave_locked_tables_mode() locked_tables_mode= LTM_NONE; } + +/* + Per-transaction schema bindings. + + See declarations in sql_class.h. Bindings are populated by + tdc_acquire_share() on first open of a name within a transaction and + cleared by MDL_context::release_transactional_locks() at end of + transaction. Each binding pins its TABLE_SHARE_VERSION (ref_count +1) + so that the version stays alive across statement boundaries — letting + the transaction continue using its snapshot even after a concurrent + DDL has installed a newer version. 
+*/ + +namespace { +struct Schema_binding +{ + uchar key[NAME_LEN + 1 + NAME_LEN + 1]; + uint key_length; + TABLE_SHARE_VERSION *version; +}; + +extern "C" const uchar *schema_binding_key(const void *binding, size_t *length, + my_bool) +{ + const Schema_binding *b= static_cast<const Schema_binding*>(binding); + *length= b->key_length; + return b->key; +} +} // anonymous namespace + + +TABLE_SHARE_VERSION *THD::lookup_schema_binding(const uchar *key, + uint key_length) +{ + if (!my_hash_inited(&transaction_schema_bindings)) + return NULL; + Schema_binding *b= reinterpret_cast<Schema_binding*>( + my_hash_search(&transaction_schema_bindings, key, key_length)); + return b ? b->version : NULL; +} + + +bool THD::add_schema_binding(const uchar *key, uint key_length, + TABLE_SHARE_VERSION *version) +{ + if (!my_hash_inited(&transaction_schema_bindings)) + { + if (my_hash_init(PSI_INSTRUMENT_ME, &transaction_schema_bindings, + &my_charset_bin, 0, 0, 0, schema_binding_key, + my_free, 0)) + return true; + } + Schema_binding *b= static_cast<Schema_binding*>( + my_malloc(PSI_INSTRUMENT_ME, sizeof(*b), MYF(MY_WME))); + if (!b) + return true; + DBUG_ASSERT(key_length <= sizeof(b->key)); + memcpy(b->key, key, key_length); + b->key_length= key_length; + b->version= version; + + /* + The binding pins its version by holding a +1 on ref_count. The pin is + dropped by release_transaction_schema_bindings() at end of transaction, + or earlier by remove_schema_binding() when the share is being torn + down. + */ + mysql_mutex_lock(&version->share->tdc->LOCK_table_share); + version->ref_count++; + mysql_mutex_unlock(&version->share->tdc->LOCK_table_share); + + if (my_hash_insert(&transaction_schema_bindings, reinterpret_cast<uchar*>(b))) + { + /* OOM after the ref bump — undo the pin and free.
*/ + mysql_mutex_lock(&version->share->tdc->LOCK_table_share); + --version->ref_count; + mysql_mutex_unlock(&version->share->tdc->LOCK_table_share); + my_free(b); + return true; + } + return false; +} + + +void THD::remove_schema_binding(const uchar *key, uint key_length) +{ + if (!my_hash_inited(&transaction_schema_bindings)) + return; + Schema_binding *b= reinterpret_cast<Schema_binding*>( + my_hash_search(&transaction_schema_bindings, key, key_length)); + if (!b) + return; + TABLE_SHARE *share= b->version->share; + /* my_hash_delete frees the Schema_binding (via the registered my_free). */ + my_hash_delete(&transaction_schema_bindings, reinterpret_cast<uchar*>(b)); + /* Drop the binding's ref. tdc_release_share dispatches CURRENT vs OLDER. */ + tdc_release_share(share); +} + + +void THD::release_transaction_schema_bindings() +{ + if (!my_hash_inited(&transaction_schema_bindings) || + transaction_schema_bindings.records == 0) + return; + /* + Walk every binding by index and release its version's ref. Calling + tdc_release_share may GC the version (if OLDER and ref reaches 0) but + does not touch our Schema_binding entry; my_hash_reset below frees those. + */ + for (ulong i= 0; i < transaction_schema_bindings.records; i++) + { + Schema_binding *b= reinterpret_cast<Schema_binding*>( + my_hash_element(&transaction_schema_bindings, i)); + tdc_release_share(b->version->share); + } + my_hash_reset(&transaction_schema_bindings); +} + void THD::get_definer(LEX_USER *definer, bool role) { binlog_invoker(role); diff --git a/sql/sql_class.h b/sql/sql_class.h index 03394f46307c0..bbb01e60f4c2d 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -113,6 +113,7 @@ class rpl_io_thread_info; class rpl_sql_thread_info; #ifdef HAVE_REPLICATION struct Slave_info; #endif +struct TABLE_SHARE_VERSION; enum enum_ha_read_modes { RFIRST, RNEXT, RPREV, RLAST, RKEY, RNEXT_SAME }; @@ -3407,6 +3408,20 @@ class THD: public THD_count, /* this must be first */ chapter 'Miscellaneous functions', for functions GET_LOCK, RELEASE_LOCK.
*/ HASH ull_hash; + /* + Per-transaction bindings of (db, table_name) to the TABLE_SHARE_VERSION + the transaction first opened. Populated by tdc_acquire_share() on first + open of a name within the transaction; released at end of transaction + by release_transaction_schema_bindings(). + + Each binding holds a +1 on its version's ref_count for the lifetime of + the transaction. This pin keeps the version alive (and thus visible to + bound-path lookups in tdc_acquire_share) even after a concurrent DDL + has installed a newer version as the chain tail. + + Initialized lazily on first use. + */ + HASH transaction_schema_bindings; /* Hash of used sequences (for PREVIOUS value) */ HASH sequences; #ifdef DBUG_ASSERT_EXISTS @@ -5716,6 +5731,25 @@ class THD: public THD_count, /* this must be first */ if (!in_active_multi_stmt_transaction()) mdl_context.release_transactional_locks(this); } + + /* + Per-transaction schema binding API: + - lookup_schema_binding() returns the TABLE_SHARE_VERSION the txn is bound + to for this name, or NULL if not bound yet. + - add_schema_binding() records a binding and bumps version->ref_count; + returns true on OOM. + - remove_schema_binding() drops a single binding (and its ref) — used + when the share is being torn down out-of-band. + - release_transaction_schema_bindings() clears all bindings; called at + end of transaction from MDL_context::release_transactional_locks(). 
+ */ + TABLE_SHARE_VERSION *lookup_schema_binding(const uchar *key, + uint key_length); + bool add_schema_binding(const uchar *key, uint key_length, + TABLE_SHARE_VERSION *version); + void remove_schema_binding(const uchar *key, uint key_length); + void release_transaction_schema_bindings(); + int decide_logging_format(TABLE_LIST *tables); /* diff --git a/sql/sql_handler.cc b/sql/sql_handler.cc index c5131e529f9e9..a518529bb8f12 100644 --- a/sql/sql_handler.cc +++ b/sql/sql_handler.cc @@ -1191,7 +1191,7 @@ void mysql_ha_flush(THD *thd) ((hash_tables->table->mdl_ticket && hash_tables->table->mdl_ticket->has_pending_conflicting_lock()) || (!hash_tables->table->s->tmp_table && - hash_tables->table->s->tdc->flushed))) + hash_tables->table->s->version->flushed))) mysql_ha_close_table(hash_tables); } diff --git a/sql/sql_insert.cc b/sql/sql_insert.cc index c3d8824e6e6a9..72e6e20021402 100644 --- a/sql/sql_insert.cc +++ b/sql/sql_insert.cc @@ -3888,7 +3888,7 @@ bool Delayed_insert::handle_inserts(void) THD_STAGE_INFO(&thd, stage_insert); max_rows= delayed_insert_limit; - if (thd.killed || table->s->tdc->flushed) + if (thd.killed || table->s->version->flushed) { thd.set_killed(KILL_SYSTEM_THREAD); max_rows= ULONG_MAX; // Do as much as possible diff --git a/sql/sql_table.cc b/sql/sql_table.cc index b07f16102a6bb..95af78dbca6db 100644 --- a/sql/sql_table.cc +++ b/sql/sql_table.cc @@ -10680,6 +10680,122 @@ const char *online_alter_check_supported(THD *thd, } +/** + Lock-free ALTER TABLE for the small subset of changes that only update + SQL-layer metadata in TABLE_SHARE without touching the storage engine. + + Currently handles "ALTER TABLE t COMMENT='...'" only. Builds a new + TABLE_SHARE_VERSION by re-reading the existing .frm and overriding the + comment field, then installs it via tdc_install_version. In-flight + transactions keep using their bound version (old comment); new opens + see the new version (new comment). 
+ + Known limitations of this initial implementation: + - The on-disk .frm is NOT yet rewritten. The new comment is only in + memory. Server restart will reload the original comment from disk. + A follow-up patch must call mysql_create_frm / create_table_impl to + persist the change. + - No binlog logging. The DDL is not replicated. + - No DDL log entry for crash recovery. + + None of these affect the lock-free demonstration: a long-running SELECT + in session A keeps seeing the old comment; ALTER in session B completes + without waiting; new SELECTs in session C see the new comment. +*/ + +static bool mysql_alter_table_comment_lockfree( + THD *thd, TABLE_LIST *table_list, + Table_specification_st *create_info) +{ + DBUG_ENTER("mysql_alter_table_comment_lockfree"); + + TABLE *table= table_list->table; + DBUG_ASSERT(table); + TABLE_SHARE *old_share= table->s; + TDC_element *element= old_share->tdc; + + /* + Allocate a fresh TABLE_SHARE for the new version. alloc_table_share + sets the .frm path based on (db, table_name), so open_table_def will + read the existing on-disk .frm. + */ + TABLE_SHARE *new_share= alloc_table_share(old_share->db.str, + old_share->table_name.str, + old_share->table_cache_key.str, + old_share->table_cache_key.length); + if (!new_share) + DBUG_RETURN(true); + + open_table_def(thd, new_share, GTS_TABLE | GTS_USE_DISCOVERY); + if (new_share->error != OPEN_FRM_OK) + { + free_table_share(new_share); + DBUG_RETURN(true); + } + + /* + Override the comment with the new value supplied by the user. The new + bytes are allocated from the new share's own mem_root so they survive + as long as the share.
+ */ + if (create_info->comment.str) + { + char *new_comment_str= + strmake_root(&new_share->mem_root, + create_info->comment.str, + create_info->comment.length); + if (!new_comment_str) + { + free_table_share(new_share); + DBUG_RETURN(true); + } + new_share->comment.str= new_comment_str; + new_share->comment.length= create_info->comment.length; + } + else + { + new_share->comment.str= ""; + new_share->comment.length= 0; + } + + /* Allocate the new version. */ + TABLE_SHARE_VERSION *v2= tdc_alloc_version(); + if (!v2) + { + free_table_share(new_share); + DBUG_RETURN(true); + } + + /* + Install v2 as the new tail of the version chain. Existing in-flight + transactions remain bound to v1 (the previous tail); new opens get v2. + No drain of in-use TABLEs, no close_all_tables_for_name. The SU MDL we + hold is compatible with concurrent SR/SW so DMLs are not blocked. + + tdc_install_version internally marks v1 as flushed and drains its + cached (idle) TABLEs in free_tables[]. In-use TABLEs (DMLs mid- + statement) are not touched; they get destroyed when their statement + ends via tc_release_table's flushed check. + */ + v2->share= new_share; + v2->ref_count= 0; + v2->flushed= false; + new_share->tdc= element; + /* Set m_psi before publishing: once installed, other threads may use it. */ + new_share->m_psi= PSI_CALL_get_table_share(false, new_share); + tdc_install_version(element, v2); + + /* + v1 stays alive while bound transactions reference it. When the last + such binding releases (at txn commit/rollback) and the last cached + TABLE in v1->free_tables[] is destroyed, v1->ref_count hits 0 and + tdc_release_share GCs it via the OLDER branch (tdc_gc_version). + */ + + DBUG_RETURN(false); +} + + /** Alter table @@ -10976,7 +11092,7 @@ bool mysql_alter_table(THD *thd, const LEX_CSTRING *new_db, Table is a shared table. Remove the .frm file. Discovery will create a new one if needed.
*/ - table->s->tdc->flushed= 1; // Force close of all instances + table->s->version->flushed= 1; // Force close of all instances if (thd->mdl_context.upgrade_shared_lock(mdl_ticket, MDL_EXCLUSIVE, thd->variables.lock_wait_timeout)) DBUG_RETURN(1); @@ -10997,6 +11113,33 @@ bool mysql_alter_table(THD *thd, const LEX_CSTRING *new_db, table->use_all_columns(); + /* + Lock-free ALTER TABLE for the small subset of changes that only update + SQL-layer metadata. Currently scoped to ALTER COMMENT. Other DDLs + fall through to the regular drain-and-replace path below. + + Conditions: + - Pure SQL ALTER (not partition admin or similar). + - No structural changes (alter_info->flags == 0). + - Only the COMMENT field changed in create_info. + - Not a rename. + - Base table, not a system or log table. + - Not under LOCK TABLES (the regular path has subtle locked-tables + bookkeeping we don't replicate here). + */ + if (thd->lex->sql_command == SQLCOM_ALTER_TABLE && + alter_info->flags == 0 && + create_info->used_fields == HA_CREATE_USED_COMMENT && + !alter_ctx.is_table_renamed() && + table->s->tmp_table == NO_TMP_TABLE && + table->s->table_category == TABLE_CATEGORY_USER && + thd->locked_tables_mode != LTM_LOCK_TABLES && + thd->locked_tables_mode != LTM_PRELOCKED_UNDER_LOCK_TABLES) + { + DBUG_RETURN(mysql_alter_table_comment_lockfree(thd, table_list, + create_info)); + } + /* Prohibit changing of the UNION list of a non-temporary MERGE table under LOCK tables. 
It would be quite difficult to reuse a shrinked diff --git a/sql/sql_test.cc b/sql/sql_test.cc index 55dce30872955..46e004938e66d 100644 --- a/sql/sql_test.cc +++ b/sql/sql_test.cc @@ -94,7 +94,12 @@ static my_bool print_cached_tables_callback(void *el, void*) TABLE *entry; mysql_mutex_lock(&element->LOCK_table_share); - All_share_tables_list::Iterator it(element->all_tables); + if (!element->current()) + { + mysql_mutex_unlock(&element->LOCK_table_share); + return FALSE; + } + All_share_tables_list::Iterator it(element->current()->all_tables); while ((entry= it++)) { THD *in_use= entry->in_use; diff --git a/sql/table.cc b/sql/table.cc index e150bd238392c..a15459dab16c8 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -5792,16 +5792,17 @@ bool TABLE_SHARE::visit_subgraph(Wait_for_flush *wait_for_flush, /* To protect all_tables list from being concurrently modified - while we are iterating through it we increment tdc.all_tables_refs. + while we are iterating through it we increment version->all_tables_refs. This does not introduce deadlocks in the deadlock detector because we won't try to acquire tdc.LOCK_table_share while holding a write-lock on MDL_lock::m_rwlock. 
*/ + TABLE_SHARE_VERSION *v= version; mysql_mutex_lock(&tdc->LOCK_table_share); - tdc->all_tables_refs++; + v->all_tables_refs++; mysql_mutex_unlock(&tdc->LOCK_table_share); - All_share_tables_list::Iterator tables_it(tdc->all_tables); + All_share_tables_list::Iterator tables_it(v->all_tables); /* In case of multiple searches running in parallel, avoid going @@ -5819,7 +5820,7 @@ bool TABLE_SHARE::visit_subgraph(Wait_for_flush *wait_for_flush, while ((table= tables_it++)) { - DBUG_ASSERT(table->in_use && tdc->flushed); + DBUG_ASSERT(table->in_use && version->flushed); if (gvisitor->inspect_edge(&table->in_use->mdl_context)) { goto end_leave_node; @@ -5829,7 +5830,7 @@ bool TABLE_SHARE::visit_subgraph(Wait_for_flush *wait_for_flush, tables_it.rewind(); while ((table= tables_it++)) { - DBUG_ASSERT(table->in_use && tdc->flushed); + DBUG_ASSERT(table->in_use && version->flushed); if (table->in_use->mdl_context.visit_subgraph(gvisitor)) { goto end_leave_node; @@ -5843,7 +5844,7 @@ bool TABLE_SHARE::visit_subgraph(Wait_for_flush *wait_for_flush, end: mysql_mutex_lock(&tdc->LOCK_table_share); - if (!--tdc->all_tables_refs) + if (!--v->all_tables_refs) mysql_cond_broadcast(&tdc->COND_release); mysql_mutex_unlock(&tdc->LOCK_table_share); @@ -5881,7 +5882,7 @@ bool TABLE_SHARE::wait_for_old_version(THD *thd, struct timespec *abstime, MDL_wait::enum_wait_status wait_status; mysql_mutex_assert_owner(&tdc->LOCK_table_share); - DBUG_ASSERT(tdc->flushed); + DBUG_ASSERT(version->flushed); DBUG_ASSERT(mdl_context->m_wait.get_status() == MDL_wait::EMPTY); tdc->m_flush_tickets.push_front(&ticket); @@ -5947,7 +5948,7 @@ bool TABLE_SHARE::wait_for_old_version(THD *thd, struct timespec *abstime, void TABLE::init(THD *thd, TABLE_LIST *tl) { - DBUG_ASSERT(s->tmp_table != NO_TMP_TABLE || s->tdc->ref_count > 0); + DBUG_ASSERT(s->tmp_table != NO_TMP_TABLE || s->version->ref_count > 0); if (thd->lex->need_correct_ident()) alias_name_used= !s->table_name.streq(tl->alias); diff --git 
a/sql/table.h b/sql/table.h index 0713341840127..850c9c765b5be 100644 --- a/sql/table.h +++ b/sql/table.h @@ -72,6 +72,7 @@ class Copy_field; class Table_statistics; class With_element; struct TDC_element; +struct TABLE_SHARE_VERSION; class Virtual_column_info; class Table_triggers_list; class TMP_TABLE_PARAM; @@ -754,6 +755,13 @@ struct TABLE_SHARE void unlock_share() { if (!tmp_table) mysql_mutex_unlock(&LOCK_share); } TDC_element *tdc; + /** + Back-pointer to the TABLE_SHARE_VERSION that owns this share. Set by + tdc_install_version() and stable for the share's lifetime. Used by + tc_release_table/tc_add_table to navigate to the per-version + all_tables/free_tables lists in O(1). + */ + TABLE_SHARE_VERSION *version; LEX_CUSTRING tabledef_version; diff --git a/sql/table_cache.cc b/sql/table_cache.cc index d2a95d90178d9..06a57732d89b5 100644 --- a/sql/table_cache.cc +++ b/sql/table_cache.cc @@ -257,30 +257,31 @@ uint tc_records(void) static void tc_remove_table(TABLE *table) { TDC_element *element= table->s->tdc; + TABLE_SHARE_VERSION *version= table->s->version; mysql_mutex_lock(&element->LOCK_table_share); - /* Wait for MDL deadlock detector to complete traversing tdc.all_tables. */ - while (element->all_tables_refs) + /* Wait for MDL deadlock detector to complete traversing all_tables. 
*/ + while (version->all_tables_refs) mysql_cond_wait(&element->COND_release, &element->LOCK_table_share); - element->all_tables.remove(table); + version->all_tables.remove(table); mysql_mutex_unlock(&element->LOCK_table_share); intern_close_table(table); } -static void tc_remove_all_unused_tables(TDC_element *element, +static void tc_remove_all_unused_tables(TABLE_SHARE_VERSION *version, Share_free_tables::List *purge_tables) { for (uint32 i= 0; i < tc_instances; i++) { mysql_mutex_lock(&tc[i].LOCK_table_cache); - while (auto table= element->free_tables[i].list.pop_front()) + while (auto table= version->free_tables[i].list.pop_front()) { tc[i].records--; tc[i].free_tables.remove(table); - DBUG_ASSERT(element->all_tables_refs == 0); - element->all_tables.remove(table); + DBUG_ASSERT(version->all_tables_refs == 0); + version->all_tables.remove(table); purge_tables->push_front(table); } mysql_mutex_unlock(&tc[i].LOCK_table_cache); @@ -309,7 +310,15 @@ static my_bool tc_purge_callback(void *_element, void *_purge_tables) Share_free_tables::List *purge_tables= static_cast<Share_free_tables::List*>(_purge_tables); mysql_mutex_lock(&element->LOCK_table_share); - tc_remove_all_unused_tables(element, purge_tables); + /* + Walk every version's free_tables. After a lock-free DDL the chain may + briefly hold multiple versions (one CURRENT plus OLDER versions still + pinned by in-flight transactions). OLDER versions have their + free_tables drained at install time by tdc_install_version, so they + are usually empty here; CURRENT may have idle cached TABLEs to purge.
+ */ + for (TABLE_SHARE_VERSION *v= element->versions_head; v; v= v->next) + tc_remove_all_unused_tables(v, purge_tables); mysql_mutex_unlock(&element->LOCK_table_share); return FALSE; } @@ -347,14 +356,15 @@ void tc_add_table(THD *thd, TABLE *table) thd->thread_id % tc_active_instances.load(std::memory_order_relaxed); TABLE *LRU_table= 0; TDC_element *element= table->s->tdc; + TABLE_SHARE_VERSION *version= table->s->version; DBUG_ASSERT(table->in_use == thd); table->instance= i; mysql_mutex_lock(&element->LOCK_table_share); - /* Wait for MDL deadlock detector to complete traversing tdc.all_tables. */ - while (element->all_tables_refs) + /* Wait for MDL deadlock detector to complete traversing all_tables. */ + while (version->all_tables_refs) mysql_cond_wait(&element->COND_release, &element->LOCK_table_share); - element->all_tables.push_front(table); + version->all_tables.push_front(table); mysql_mutex_unlock(&element->LOCK_table_share); mysql_mutex_lock(&tc[i].LOCK_table_cache); @@ -362,7 +372,7 @@ void tc_add_table(THD *thd, TABLE *table) { if ((LRU_table= tc[i].free_tables.pop_front())) { - LRU_table->s->tdc->free_tables[i].list.remove(LRU_table); + LRU_table->s->version->free_tables[i].list.remove(LRU_table); /* Needed if MDL deadlock detector chimes in before tc_remove_table() */ LRU_table->in_use= thd; mysql_mutex_unlock(&tc[i].LOCK_table_cache); @@ -395,14 +405,15 @@ void tc_add_table(THD *thd, TABLE *table) @return TABLE object, or NULL if no unused objects. 
*/ -TABLE *tc_acquire_table(THD *thd, TDC_element *element) +TABLE *tc_acquire_table(THD *thd, TABLE_SHARE_VERSION *version) { + DBUG_ASSERT(version); uint32_t n_instances= tc_active_instances.load(std::memory_order_relaxed); uint32_t i= thd->thread_id % n_instances; TABLE *table; tc[i].lock_and_check_contention(n_instances, i); - table= element->free_tables[i].list.pop_front(); + table= version->free_tables[i].list.pop_front(); if (table) { DBUG_ASSERT(!table->in_use); @@ -453,7 +464,7 @@ void tc_release_table(TABLE *table) DBUG_ASSERT(!table->pos_in_locked_tables); mysql_mutex_lock(&tc[i].LOCK_table_cache); - if (table->needs_reopen() || table->s->tdc->flushed || + if (table->needs_reopen() || table->s->version->flushed || tc[i].records > tc_size) { tc[i].records--; @@ -463,7 +474,7 @@ void tc_release_table(TABLE *table) else { table->in_use= 0; - table->s->tdc->free_tables[i].list.push_front(table); + table->s->version->free_tables[i].list.push_front(table); tc[i].free_tables.push_back(table); mysql_mutex_unlock(&tc[i].LOCK_table_cache); } @@ -471,17 +482,18 @@ void tc_release_table(TABLE *table) } +/* + Assert that the element is "clean": no version chain, no flush tickets, + not on the unused_shares LRU. Per-version state (all_tables, free_tables, + ref_count, etc.) is asserted by tdc_free_version on each version freed. + Called before the LF_HASH slot is reused or destroyed. 
+*/ + static void tdc_assert_clean_share(TDC_element *element) { - DBUG_ASSERT(element->share == 0); - DBUG_ASSERT(element->ref_count == 0); + DBUG_ASSERT(element->versions_head == 0); + DBUG_ASSERT(element->versions_tail == 0); DBUG_ASSERT(element->m_flush_tickets.is_empty()); - DBUG_ASSERT(element->all_tables.is_empty()); -#ifndef DBUG_OFF - for (uint32 i= 0; i < tc_instances; i++) - DBUG_ASSERT(element->free_tables[i].list.is_empty()); -#endif - DBUG_ASSERT(element->all_tables_refs == 0); DBUG_ASSERT(element->next == 0); DBUG_ASSERT(element->prev == 0); } @@ -489,6 +501,20 @@ static void tdc_assert_clean_share(TDC_element *element) /** Delete share from hash and free share object. + + Precondition: the version chain holds exactly one version (the CURRENT + one we're about to free). Callers must ensure all OLDER versions have + been GC'd before reaching here. The DBUG_ASSERTs below enforce this. + + Reasons this holds for current callers: + - tdc_release_share's CURRENT path checks versions_head == versions_tail + before invoking this (or pushing to unused_shares). + - tdc_remove_referenced_share is called under X MDL after a drain; any + OLDER versions would have been GC'd when conflicting transactions + released their bindings. + - tdc_remove_table goes through the same X-MDL flow. + - tdc_purge picks elements off unused_shares, which were placed there + by tdc_release_share under the same chain-length check. */ static void tdc_delete_share_from_hash(TDC_element *element) @@ -496,12 +522,21 @@ static void tdc_delete_share_from_hash(TDC_element *element) THD *thd= current_thd; LF_PINS *pins; TABLE_SHARE *share; + TABLE_SHARE_VERSION *version; DBUG_ENTER("tdc_delete_share_from_hash"); mysql_mutex_assert_owner(&element->LOCK_table_share); - share= element->share; + version= element->current(); + DBUG_ASSERT(version); + /* Precondition: chain length 1. 
*/ + DBUG_ASSERT(version == element->versions_head); + DBUG_ASSERT(version->next == 0); + DBUG_ASSERT(version->prev == 0); + share= version->share; DBUG_ASSERT(share); - element->share= 0; + version->share= 0; + element->versions_head= 0; + element->versions_tail= 0; PSI_CALL_release_table_share(share->m_psi); share->m_psi= 0; @@ -534,6 +569,7 @@ static void tdc_delete_share_from_hash(TDC_element *element) if (!thd) lf_hash_put_pins(pins); free_table_share(share); + tdc_free_version(version); DBUG_VOID_RETURN; } @@ -550,12 +586,9 @@ static void lf_alloc_constructor(uchar *arg) &element->LOCK_table_share, MY_MUTEX_INIT_FAST); mysql_cond_init(key_TABLE_SHARE_COND_release, &element->COND_release, 0); element->m_flush_tickets.empty(); - element->all_tables.empty(); - for (uint32 i= 0; i < tc_instances; i++) - element->free_tables[i].list.empty(); - element->all_tables_refs= 0; - element->share= 0; - element->ref_count= 0; + element->versions_head= 0; + element->versions_tail= 0; + element->next_schema_version= 0; element->next= 0; element->prev= 0; DBUG_VOID_RETURN; @@ -577,6 +610,139 @@ static void lf_alloc_destructor(uchar *arg) } +/** + Allocate and zero-initialize a TABLE_SHARE_VERSION with a trailing + free_tables[] array sized to tc_instances; returns NULL if allocation fails. + The caller installs it via tdc_install_version(), which takes the lock itself. +*/ + +TABLE_SHARE_VERSION *tdc_alloc_version() +{ + size_t size= sizeof(TABLE_SHARE_VERSION) + + sizeof(Share_free_tables) * (tc_instances - 1); + TABLE_SHARE_VERSION *v= (TABLE_SHARE_VERSION*) + my_malloc(PSI_INSTRUMENT_ME, size, MYF(MY_WME | MY_ZEROFILL)); + if (!v) + return NULL; + v->all_tables.empty(); + for (uint32 i= 0; i < tc_instances; i++) + v->free_tables[i].list.empty(); + return v; +} + + +/** + Append a TABLE_SHARE_VERSION to a TDC_element's chain as the new tail + (i.e. the new "current" version served to new opens). Assigns + schema_version from the element's per-element monotonic counter.
+ + If the chain already had a tail, that tail is demoted to OLDER: marked + flushed (so future tc_release_table calls destroy its TABLEs rather + than cache them) and its existing cached free_tables[] are drained + eagerly. In-use TABLEs of the demoted version are untouched and continue + serving their statements; they get destroyed on their next release. + + Takes e->LOCK_table_share internally; caller should not hold it. +*/ + +void tdc_install_version(TDC_element *e, TABLE_SHARE_VERSION *v) +{ + DBUG_ASSERT(v->next == 0 && v->prev == 0); + DBUG_ASSERT(v->share); + + /* + Cached TABLEs in the previous tail's free_tables[] become unreachable + once we demote that tail to OLDER (no new transaction will bind to an + OLDER version, and existing v1-bound transactions will allocate fresh + TABLEs on their next open if free_tables is empty). We mark the + previous tail flushed so future tc_release_table calls destroy v1's + TABLEs instead of caching them, and we eagerly drain whatever idle + TABLEs are already in free_tables[] here. In-use TABLEs (table->in_use + != 0) are not in free_tables[] by invariant; they continue serving + their statement and get destroyed later when that statement ends. + Once both routes drop v1's ref_count to 0, tdc_release_share's OLDER + branch GCs the version. + */ + Share_free_tables::List purge_tables; + + mysql_mutex_lock(&e->LOCK_table_share); + v->schema_version= ++e->next_schema_version; + v->prev= e->versions_tail; + v->next= 0; + if (e->versions_tail) + { + e->versions_tail->next= v; + e->versions_tail->flushed= true; + tc_remove_all_unused_tables(e->versions_tail, &purge_tables); + } + e->versions_tail= v; + if (!e->versions_head) + e->versions_head= v; + v->share->version= v; + mysql_mutex_unlock(&e->LOCK_table_share); + + while (auto table= purge_tables.pop_front()) + intern_close_table(table); +} + + +/** + Remove an OLDER (i.e. 
non-tail) TABLE_SHARE_VERSION from a TDC_element's + chain and free its TABLE_SHARE and the version itself. + + Called from tdc_release_share's OLDER branch when an OLDER version's + ref_count drops to 0 (the last in-use TABLE was destroyed or the last + binding to it was released). + + Caller must own e->LOCK_table_share. v must satisfy: + - v != e->versions_tail (the current version is never GC'd here; + it's freed by tdc_delete_share_from_hash when the element is removed) + - v->ref_count == 0 +*/ + +void tdc_gc_version(TDC_element *e, TABLE_SHARE_VERSION *v) +{ + mysql_mutex_assert_owner(&e->LOCK_table_share); + DBUG_ASSERT(v != e->versions_tail); + DBUG_ASSERT(v->ref_count == 0); + + if (v->prev) + v->prev->next= v->next; + if (v->next) + v->next->prev= v->prev; + if (v == e->versions_head) + e->versions_head= v->next; + + TABLE_SHARE *share= v->share; + v->share= 0; + v->next= v->prev= 0; + free_table_share(share); + tdc_free_version(v); +} + + +/** + Free a TABLE_SHARE_VERSION. The caller must have already detached it from + any TDC_element's chain and freed the underlying TABLE_SHARE (so + v->share is 0). DBUG_ASSERTs below enforce the expected clean state: + no share, no refs, empty all_tables, no MDL deadlock-detector refs on + all_tables, and empty per-instance free_tables[]. 
+*/ + +void tdc_free_version(TABLE_SHARE_VERSION *v) +{ + DBUG_ASSERT(v->share == 0); + DBUG_ASSERT(v->ref_count == 0); + DBUG_ASSERT(v->all_tables.is_empty()); + DBUG_ASSERT(v->all_tables_refs == 0); +#ifndef DBUG_OFF + for (uint32 i= 0; i < tc_instances; i++) + DBUG_ASSERT(v->free_tables[i].list.is_empty()); +#endif + my_free(v); +} + + static void tdc_hash_initializer(LF_HASH *, void *_element, const void *_key) { @@ -615,9 +781,7 @@ bool tdc_init(void) tdc_inited= true; mysql_mutex_init(key_LOCK_unused_shares, &LOCK_unused_shares, MY_MUTEX_INIT_FAST); - lf_hash_init(&tdc_hash, - sizeof(TDC_element) + - sizeof(Share_free_tables) * (tc_instances - 1), + lf_hash_init(&tdc_hash, sizeof(TDC_element), LF_HASH_UNIQUE, 0, 0, tdc_hash_key, &my_charset_bin); tdc_hash.alloc.constructor= lf_alloc_constructor; tdc_hash.alloc.destructor= lf_alloc_destructor; @@ -705,7 +869,7 @@ void tdc_purge(bool all) element->prev= 0; element->next= 0; mysql_mutex_lock(&element->LOCK_table_share); - if (element->ref_count) + if (element->current() && element->current()->ref_count) { mysql_mutex_unlock(&element->LOCK_table_share); mysql_mutex_unlock(&LOCK_unused_shares); @@ -751,7 +915,8 @@ TDC_element *tdc_lock_share(THD *thd, const char *db, const char *table_name) if (element) { mysql_mutex_lock(&element->LOCK_table_share); - if (unlikely(!element->share || element->share->error)) + if (unlikely(!element->current() || + !element->current()->share || element->current()->share->error)) { mysql_mutex_unlock(&element->LOCK_table_share); element= 0; @@ -815,6 +980,8 @@ TABLE_SHARE *tdc_acquire_share(THD *thd, TABLE_LIST *tl, uint flags, { TABLE_SHARE *share; TDC_element *element; + TABLE_SHARE_VERSION *bound= NULL; + TABLE_SHARE_VERSION *cur; const char *key; uint key_length= get_table_def_key(tl, &key); my_hash_value_type hash_value= tl->mdl_request.key.tc_hash_value(); @@ -824,6 +991,62 @@ TABLE_SHARE *tdc_acquire_share(THD *thd, TABLE_LIST *tl, uint flags, if (fix_thd_pins(thd)) 
DBUG_RETURN(0); + /* + If this transaction is already bound to a version of this name, use + that version (which may be OLDER than versions_tail when DDL has + installed newer versions). The binding pins the version via + ref_count, so it stays alive for the rest of this transaction. + + GTS_FORCE_DISCOVERY and GTS_NOLOCK callers don't want a binding-based + fast path — they explicitly want to re-read from disk or merely probe + existence — so they go through the lf_hash lookup path. + */ + if (!(flags & (GTS_FORCE_DISCOVERY | GTS_NOLOCK))) + bound= thd->lookup_schema_binding((const uchar*) key, key_length); + + if (bound) + { + element= bound->share->tdc; + share= bound->share; + DBUG_ASSERT(share->tdc == element); + + if (out_table && (flags & GTS_TABLE)) + { + if ((*out_table= tc_acquire_table(thd, bound))) + { + DBUG_ASSERT(!share->error); + DBUG_ASSERT(!share->is_view); + status_var_increment(thd->status_var.table_open_cache_hits); + goto end; + } + status_var_increment(thd->status_var.table_open_cache_misses); + } + + mysql_mutex_lock(&element->LOCK_table_share); + if (unlikely(share->error)) + { + open_table_error(share, share->error, share->open_errno); + goto err; + } + if (share->is_view && !(flags & GTS_VIEW)) + { + open_table_error(share, OPEN_FRM_NOT_A_TABLE, ENOENT); + goto err; + } + if (!share->is_view && !(flags & GTS_TABLE)) + { + open_table_error(share, OPEN_FRM_NOT_A_VIEW, ENOENT); + goto err; + } + /* + Bound version has ref_count >= 1 (the binding pin), so was_unused is + always false here — no unused_shares LRU dance needed. 
+ */ + bound->ref_count++; + mysql_mutex_unlock(&element->LOCK_table_share); + goto end; + } + retry: while (!(element= (TDC_element*) lf_hash_search_using_hash_value(&tdc_hash, thd->tdc_hash_pins, hash_value, (uchar*) key, key_length))) @@ -839,14 +1062,22 @@ TABLE_SHARE *tdc_acquire_share(THD *thd, TABLE_LIST *tl, uint flags, element= (TDC_element*) lf_hash_search_using_hash_value(&tdc_hash, thd->tdc_hash_pins, hash_value, (uchar*) key, key_length); /* It's safe to unpin the pins here, because an empty element was inserted - above, "empty" means at least element->share = 0. Some other thread can't - delete it while element->share == 0. And element->share is also protected - with element->LOCK_table_share mutex. */ + above, "empty" means at least element->versions_tail = 0. Some other + thread can't delete it while versions_tail == 0. And the chain is + protected with element->LOCK_table_share mutex. */ lf_hash_search_unpin(thd->tdc_hash_pins); DBUG_ASSERT(element); + TABLE_SHARE_VERSION *version= tdc_alloc_version(); + if (!version) + { + lf_hash_delete(&tdc_hash, thd->tdc_hash_pins, key, key_length); + DBUG_RETURN(0); + } + if (!(share= alloc_table_share(tl->db.str, tl->table_name.str, key, key_length))) { + tdc_free_version(version); lf_hash_delete(&tdc_hash, thd->tdc_hash_pins, key, key_length); DBUG_RETURN(0); } @@ -857,16 +1088,21 @@ TABLE_SHARE *tdc_acquire_share(THD *thd, TABLE_LIST *tl, uint flags, if (checked_unlikely(share->error)) { free_table_share(share); + tdc_free_version(version); lf_hash_delete(&tdc_hash, thd->tdc_hash_pins, key, key_length); DBUG_RETURN(0); } - mysql_mutex_lock(&element->LOCK_table_share); - element->share= share; + /* + version is freshly allocated and not yet visible to other threads; + the field sets below don't need LOCK_table_share. tdc_install_version + takes the lock internally to chain the version into element->versions_*. 
+ */ + version->share= share; + version->ref_count= 1; + version->flushed= false; share->tdc= element; - element->ref_count++; - element->flushed= false; - mysql_mutex_unlock(&element->LOCK_table_share); + tdc_install_version(element, version); tdc_purge(false); if (out_table) @@ -881,23 +1117,44 @@ TABLE_SHARE *tdc_acquire_share(THD *thd, TABLE_LIST *tl, uint flags, /* cannot force discovery of a cached share */ DBUG_ASSERT(!(flags & GTS_FORCE_DISCOVERY)); + /* + The LF_HASH pin protects the TDC_element memory but NOT the heap-allocated + TABLE_SHARE_VERSION it points to: a concurrent tdc_purge can drop the + element off unused_shares LRU and free its versions while we still hold + the pin. Take LOCK_table_share before reading element->current() so the + version stays pinned while we use it. The lock is held continuously + through both the cache-hit fast path and the slow path; tc_acquire_table + nests fine because LOCK_table_share → tc[i].LOCK_table_cache is the same + ordering tc_remove_all_unused_tables already uses. 
+ */ + mysql_mutex_lock(&element->LOCK_table_share); + cur= element->current(); + if (!cur) + { + mysql_mutex_unlock(&element->LOCK_table_share); + lf_hash_search_unpin(thd->tdc_hash_pins); + std::this_thread::yield(); + goto retry; + } + if (out_table && (flags & GTS_TABLE)) { - if ((*out_table= tc_acquire_table(thd, element))) + if ((*out_table= tc_acquire_table(thd, cur))) { - lf_hash_search_unpin(thd->tdc_hash_pins); DBUG_ASSERT(!(flags & GTS_NOLOCK)); - DBUG_ASSERT(element->share); - DBUG_ASSERT(!element->share->error); - DBUG_ASSERT(!element->share->is_view); + DBUG_ASSERT(cur->share); + DBUG_ASSERT(!cur->share->error); + DBUG_ASSERT(!cur->share->is_view); status_var_increment(thd->status_var.table_open_cache_hits); - DBUG_RETURN(element->share); + share= cur->share; + mysql_mutex_unlock(&element->LOCK_table_share); + lf_hash_search_unpin(thd->tdc_hash_pins); + goto end; } status_var_increment(thd->status_var.table_open_cache_misses); } - mysql_mutex_lock(&element->LOCK_table_share); - if (!(share= element->share)) + if (!(share= cur->share)) { mysql_mutex_unlock(&element->LOCK_table_share); lf_hash_search_unpin(thd->tdc_hash_pins); @@ -927,8 +1184,8 @@ TABLE_SHARE *tdc_acquire_share(THD *thd, TABLE_LIST *tl, uint flags, goto err; } - was_unused= !element->ref_count; - element->ref_count++; + was_unused= !cur->ref_count; + cur->ref_count++; mysql_mutex_unlock(&element->LOCK_table_share); if (was_unused) { @@ -949,7 +1206,23 @@ TABLE_SHARE *tdc_acquire_share(THD *thd, TABLE_LIST *tl, uint flags, end: DBUG_PRINT("exit", ("share: %p ref_count: %u", - share, share->tdc->ref_count)); + share, share->version->ref_count)); + /* + Record that this transaction is using the current version of this + name. The binding pins the version via ref_count and is released at + end of transaction by MDL_context::release_transactional_locks(). + + If we took the bound branch above, `bound` is non-NULL and we already + have a binding to that version — skip. 
Otherwise install a binding to + the version we just acquired (which is current() since the unbound + path always uses current()). Skip for GTS_NOLOCK callers (they're + just probing existence). + */ + if (!(flags & GTS_NOLOCK) && !bound) + { + (void) thd->add_schema_binding((const uchar*) key, key_length, + share->version); + } if (flags & GTS_NOLOCK) { tdc_release_share(share); @@ -976,46 +1249,82 @@ TABLE_SHARE *tdc_acquire_share(THD *thd, TABLE_LIST *tl, uint flags, void tdc_release_share(TABLE_SHARE *share) { + TABLE_SHARE_VERSION *v= share->version; + TDC_element *e= share->tdc; DBUG_ENTER("tdc_release_share"); + DBUG_ASSERT(v); - mysql_mutex_lock(&share->tdc->LOCK_table_share); + mysql_mutex_lock(&e->LOCK_table_share); DBUG_PRINT("enter", ("share: %p table: %s.%s ref_count: %u", - share, share->db.str, share->table_name.str, - share->tdc->ref_count)); - DBUG_ASSERT(share->tdc->ref_count); + share, share->db.str, share->table_name.str, v->ref_count)); + DBUG_ASSERT(v->ref_count); + + /* + OLDER version: not visible to new opens. Decrement and, if no refs + remain, GC the version. No LRU/eviction logic — OLDER versions are + transient; they only exist while in-flight transactions still pin them. + */ + if (v != e->versions_tail) + { + --v->ref_count; + if (!share->is_view) + mysql_cond_broadcast(&e->COND_release); + if (v->ref_count == 0) + tdc_gc_version(e, v); + mysql_mutex_unlock(&e->LOCK_table_share); + DBUG_VOID_RETURN; + } - if (share->tdc->ref_count > 1) + /* CURRENT (versions_tail) — existing LRU/eviction behavior. 
*/ + if (v->ref_count > 1) { - share->tdc->ref_count--; + v->ref_count--; if (!share->is_view) - mysql_cond_broadcast(&share->tdc->COND_release); - mysql_mutex_unlock(&share->tdc->LOCK_table_share); + mysql_cond_broadcast(&e->COND_release); + mysql_mutex_unlock(&e->LOCK_table_share); DBUG_VOID_RETURN; } - mysql_mutex_unlock(&share->tdc->LOCK_table_share); + mysql_mutex_unlock(&e->LOCK_table_share); mysql_mutex_lock(&LOCK_unused_shares); - mysql_mutex_lock(&share->tdc->LOCK_table_share); - if (--share->tdc->ref_count) + mysql_mutex_lock(&e->LOCK_table_share); + if (--v->ref_count) { if (!share->is_view) - mysql_cond_broadcast(&share->tdc->COND_release); - mysql_mutex_unlock(&share->tdc->LOCK_table_share); + mysql_cond_broadcast(&e->COND_release); + mysql_mutex_unlock(&e->LOCK_table_share); mysql_mutex_unlock(&LOCK_unused_shares); DBUG_VOID_RETURN; } - if (share->tdc->flushed || tdc_records() > tdc_size) + /* + CURRENT's ref_count just hit 0. We may only put the element on the + unused_shares LRU or delete it from the hash when the chain holds + exactly one version (this one). If OLDER versions are still pinned + (in-use TABLEs from DMLs bound to earlier versions), the element is + not truly idle and must stay out of the LRU. Some future event — the + last OLDER version's GC, or a fresh open re-bumping ref_count — will + bring this element back into a normal state. 
+ */ + if (e->versions_head != e->versions_tail) { + if (!share->is_view) + mysql_cond_broadcast(&e->COND_release); + mysql_mutex_unlock(&e->LOCK_table_share); mysql_mutex_unlock(&LOCK_unused_shares); - tdc_delete_share_from_hash(share->tdc); + DBUG_VOID_RETURN; + } + if (v->flushed || tdc_records() > tdc_size) + { + mysql_mutex_unlock(&LOCK_unused_shares); + tdc_delete_share_from_hash(e); DBUG_VOID_RETURN; } /* Link share last in used_table_share list */ DBUG_PRINT("info", ("moving share to unused list")); - DBUG_ASSERT(share->tdc->next == 0); - unused_shares.push_back(share->tdc); - mysql_mutex_unlock(&share->tdc->LOCK_table_share); + DBUG_ASSERT(e->next == 0); + unused_shares.push_back(e); + mysql_mutex_unlock(&e->LOCK_table_share); mysql_mutex_unlock(&LOCK_unused_shares); DBUG_VOID_RETURN; } @@ -1026,12 +1335,21 @@ void tdc_remove_referenced_share(THD *thd, TABLE_SHARE *share) DBUG_ASSERT(thd->mdl_context.is_lock_owner(MDL_key::TABLE, share->db.str, share->table_name.str, MDL_EXCLUSIVE)); + /* + The share is about to be deleted from the TDC. Drop our own schema + binding (if any) so it doesn't dangle past share deletion and so the + wait/decrement below sees the expected ref_count. + */ + thd->remove_schema_binding( + reinterpret_cast<const uchar*>(share->table_cache_key.str), + share->table_cache_key.length); + share->tdc->flush_unused(true); mysql_mutex_lock(&share->tdc->LOCK_table_share); DEBUG_SYNC(thd, "before_wait_for_refs"); share->tdc->wait_for_refs(1); - DBUG_ASSERT(share->tdc->all_tables.is_empty()); - share->tdc->ref_count--; + DBUG_ASSERT(share->version->all_tables.is_empty()); + share->version->ref_count--; tdc_delete_share_from_hash(share->tdc); } @@ -1065,7 +1383,7 @@ void tdc_remove_table(THD *thd, const char *db, const char *table_name) DBUG_ASSERT(element != MY_ERRPTR); // What can we do about it? 
- if (!element->ref_count) + if (!element->current()->ref_count) { if (element->prev) { @@ -1080,11 +1398,11 @@ void tdc_remove_table(THD *thd, const char *db, const char *table_name) } mysql_mutex_unlock(&LOCK_unused_shares); - element->ref_count++; + element->current()->ref_count++; mysql_mutex_unlock(&element->LOCK_table_share); /* We have to relock the mutex to avoid code duplication. Sigh. */ - tdc_remove_referenced_share(thd, element->share); + tdc_remove_referenced_share(thd, element->current()->share); DBUG_VOID_RETURN; } @@ -1112,11 +1430,12 @@ int tdc_wait_for_old_version(THD *thd, const char *db, const char *table_name, return FALSE; else if (element == MY_ERRPTR) return TRUE; - else if (element->flushed) + else if (element->current()->flushed) { struct timespec abstime; set_timespec(abstime, wait_timeout); - return element->share->wait_for_old_version(thd, &abstime, deadlock_weight); + return element->current()->share->wait_for_old_version(thd, &abstime, + deadlock_weight); } tdc_unlock_share(element); return FALSE; @@ -1259,7 +1578,7 @@ int show_tc_active_instances(THD *thd, SHOW_VAR *var, void *buff, void TDC_element::wait_for_refs(uint my_refs) { - while (ref_count > my_refs) + while (current()->ref_count > my_refs) mysql_cond_wait(&COND_release, &LOCK_table_share); } @@ -1277,20 +1596,51 @@ void TDC_element::wait_for_refs(uint my_refs) void TDC_element::flush(THD *thd, bool mark_flushed) { - DBUG_ASSERT(thd->mdl_context.is_lock_owner(MDL_key::TABLE, share->db.str, - share->table_name.str, + DBUG_ASSERT(current()); + DBUG_ASSERT(thd->mdl_context.is_lock_owner(MDL_key::TABLE, + current()->share->db.str, + current()->share->table_name.str, MDL_EXCLUSIVE)); + const uchar *key= + reinterpret_cast<const uchar*>(current()->share->table_cache_key.str); + uint key_len= current()->share->table_cache_key.length; + + /* + If mark_flushed, the share is going away — drop our binding before + the drain so the wait doesn't include our binding's pin and so the + binding doesn't 
dangle past share deletion. + */ + if (mark_flushed) + thd->remove_schema_binding(key, key_len); flush_unused(mark_flushed); mysql_mutex_lock(&LOCK_table_share); - All_share_tables_list::Iterator it(all_tables); + /* + Between the unlock in flush_unused and the re-lock here, a concurrent + tdc_purge may have picked our element off the unused_shares LRU and + deleted it (the element's memory persists via LF_HASH hazard pointer + so the mutex above is still valid, but current() is NULL now). With + the share already gone there's nothing left to drain or wait for. + */ + if (!current()) + { + mysql_mutex_unlock(&LOCK_table_share); + return; + } + All_share_tables_list::Iterator it(current()->all_tables); uint my_refs= 0; while (auto table= it++) { if (table->in_use == thd) my_refs++; } + /* + If we didn't drop our binding above, it still pins +1 to ref_count. + Count it so the wait doesn't hang. + */ + if (thd->lookup_schema_binding(key, key_len) == current()) + my_refs++; wait_for_refs(my_refs); #ifndef DBUG_OFF it.rewind(); @@ -1310,9 +1660,10 @@ void TDC_element::flush_unused(bool mark_flushed) Share_free_tables::List purge_tables; mysql_mutex_lock(&LOCK_table_share); - if (mark_flushed) - flushed= true; - tc_remove_all_unused_tables(this, &purge_tables); + if (mark_flushed && current()) + current()->flushed= true; + if (current()) + tc_remove_all_unused_tables(current(), &purge_tables); mysql_mutex_unlock(&LOCK_table_share); while (auto table= purge_tables.pop_front()) diff --git a/sql/table_cache.h b/sql/table_cache.h index 71704ff2ed082..ea02c80275b57 100644 --- a/sql/table_cache.h +++ b/sql/table_cache.h @@ -27,33 +27,81 @@ struct Share_free_tables }; +/** + One schema version of a table. + + TDC_element holds a doubly-linked list of TABLE_SHARE_VERSIONs ordered + oldest → newest, with versions_tail being the version served to new opens. + In-flight transactions keep using an older TABLE_SHARE_VERSION while DDL + appends a newer one as the new tail. 
+ + All per-share TABLE-cache state lives here: every TABLE points to one share, + and a TABLE_SHARE_VERSION owns the set of TABLEs for its version (both the + in-use list `all_tables` and the per-cache-instance idle list `free_tables`). +*/ + +struct TABLE_SHARE_VERSION +{ + /** + Monotonic version number, allocated from TDC_element::next_schema_version + at install time. Unique within the lifetime of one TDC_element entry; + used for ordering and diagnostics. Not persisted, not propagated via + replication. For cross-server schema fingerprinting see + TABLE_SHARE::tabledef_version (UUID for tables, timestamp for views). + */ + uint64_t schema_version; + TABLE_SHARE *share; + uint ref_count; /* How many TABLE objects use this */ + uint all_tables_refs; /* Number of refs to all_tables */ + bool flushed; + /** Chain links in TDC_element. NULL on a version that hasn't been + installed yet, or after it's been GC'd. */ + TABLE_SHARE_VERSION *next, *prev; + /* + Doubly-linked (back-linked) lists of used and unused TABLE objects + for this version. Protected by TDC_element::LOCK_table_share and + (for free_tables[i]) tc[i].LOCK_table_cache. + */ + All_share_tables_list all_tables; + /** Avoid false sharing between header fields and free_tables */ + char pad[CPU_LEVEL1_DCACHE_LINESIZE]; + /** Idle TABLE objects per cache instance. Sized to tc_instances at alloc. */ + Share_free_tables free_tables[1]; +}; + + struct TDC_element { uchar m_key[NAME_LEN + 1 + NAME_LEN + 1]; uint m_key_length; - bool flushed; - TABLE_SHARE *share; /** - Protects ref_count, m_flush_tickets, all_tables, flushed, all_tables_refs. + Protects m_flush_tickets, the version chain + (versions_head/versions_tail/next_schema_version), and the mutable fields + inside each TABLE_SHARE_VERSION (ref_count, share, flushed, next/prev, + all_tables, all_tables_refs). 
*/ mysql_mutex_t LOCK_table_share; mysql_cond_t COND_release; TDC_element *next, **prev; /* Link to unused shares */ - uint ref_count; /* How many TABLE objects uses this */ - uint all_tables_refs; /* Number of refs to all_tables */ /** List of tickets representing threads waiting for the share to be flushed. + Per-name, not per-version: a flush wakes up everyone waiting on this name. */ Wait_for_flush_list m_flush_tickets; - /* - Doubly-linked (back-linked) lists of used and unused TABLE objects - for this share. + + /** + Version chain. Ordered oldest → newest. versions_tail is the version + served to new opens (== "current"). NULL when the element is being + initialized (between lf_hash_insert and the first tdc_install_version). */ - All_share_tables_list all_tables; - /** Avoid false sharing between TDC_element and free_tables */ - char pad[CPU_LEVEL1_DCACHE_LINESIZE]; - Share_free_tables free_tables[1]; + TABLE_SHARE_VERSION *versions_head; + TABLE_SHARE_VERSION *versions_tail; + /** Per-element monotonic counter for TABLE_SHARE_VERSION::schema_version. */ + uint64_t next_schema_version; + + /** The version served to new opens; NULL if none installed yet. */ + TABLE_SHARE_VERSION *current() const { return versions_tail; } inline void wait_for_refs(uint my_refs); void flush(THD *thd, bool mark_flushed); @@ -92,7 +140,18 @@ int show_tc_active_instances(THD *thd, SHOW_VAR *var, void *buff, extern void tc_purge(); extern void tc_add_table(THD *thd, TABLE *table); extern void tc_release_table(TABLE *table); -extern TABLE *tc_acquire_table(THD *thd, TDC_element *element); +extern TABLE *tc_acquire_table(THD *thd, TABLE_SHARE_VERSION *version); + +/* + Multi-version TDC helpers (used by lock-free DDL paths). + tdc_gc_version requires the caller to hold LOCK_table_share; + tdc_install_version takes that lock itself, so callers must not hold it. + tdc_alloc_version/tdc_free_version take no locks. 
+*/ +extern TABLE_SHARE_VERSION *tdc_alloc_version(); +extern void tdc_free_version(TABLE_SHARE_VERSION *v); +extern void tdc_install_version(TDC_element *e, TABLE_SHARE_VERSION *v); +extern void tdc_gc_version(TDC_element *e, TABLE_SHARE_VERSION *v); /** Create a table cache key for non-temporary table.