# 全表 update
# 背景
前几天在写 php 脚本的时候,复制别人的脚本。结果我改写过程中,update 的条件漏了 where 条件,当时就 throw error 了,按照我的经验我以为执行已经结束了,后面实际并没有执行结束。
所以说,我对一个,线上,非常关键的表,量级大概 20g 数据,4 千万条数据的表执行了全表 update,并且执行了近乎一个小时。
按照我对 mysql 事务的理解,一个 update 应该是一个事务,哪怕执行了一个小时,只要没有完全执行成功,都是不会实际更改表数据的。
所以,我打算复现一下四千万级别的数据,执行全表 update ,执行一会之后再 kill,校验数据是否正确,验证是否数据会有异常
# 初始流程
# 表结构
CREATE TABLE `table_update` ( | |
`id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, | |
`num` int(11) NOT NULL DEFAULT '0', | |
`md5` varchar(16) NOT NULL DEFAULT '', | |
`type` int(11) NOT NULL DEFAULT '0', | |
`pid` int(11) NOT NULL DEFAULT '0', | |
PRIMARY KEY (`id`) | |
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4; |
其中,md5 为 num 的 md5,type 为 num%10,如果 num%10 >7 这 type =7
mysql 机器为 4c8g 的小主机,500g 固态硬盘。
Server version: 5.7.42 MySQL Community Server (GPL)
# 导入数据
package main | |
import ( | |
"crypto/md5" | |
"database/sql" | |
"encoding/hex" | |
"fmt" | |
"runtime" | |
"strconv" | |
"sync" | |
_ "github.com/go-sql-driver/mysql" | |
) | |
var DB *sql.DB | |
var lock sync.Mutex | |
type Data struct { | |
Num int | |
MD5 string | |
TypeNum int | |
} | |
func calculateMD5(num int) string { | |
hasher := md5.New() | |
hasher.Write([]byte(strconv.Itoa(num))) | |
return hex.EncodeToString(hasher.Sum(nil))[:16] | |
} | |
func connectToDB() (*sql.DB, error) { | |
db, err := sql.Open("mysql", "root:pass@tcp(localhost:3306)/log_analysis") | |
return db, err | |
} | |
func main() { | |
runtime.GOMAXPROCS(4) | |
var err error | |
DB, err = connectToDB() | |
if err != nil { | |
fmt.Println(err) | |
return | |
} | |
defer DB.Close() | |
var maxID sql.NullInt64 | |
err = DB.QueryRow("SELECT MAX(id) FROM table_update").Scan(&maxID) | |
if err != nil { | |
fmt.Println(err) | |
return | |
} | |
start := 1 | |
if maxID.Valid { | |
start = int(maxID.Int64) + 1 | |
} | |
dataChan := make(chan Data, 1000) | |
var wg sync.WaitGroup | |
for i := 0; i < 10; i++ { | |
wg.Add(1) | |
go func(pid int) { | |
defer wg.Done() | |
for data := range dataChan { | |
err := insertToDb(data.Num, data.TypeNum, pid, data.MD5) | |
if err != nil { | |
fmt.Println(err) | |
continue | |
} | |
} | |
}(i) | |
} | |
go func() { | |
for i := start + 1; i <= 40000000; i++ { | |
num := i | |
md5 := calculateMD5(num) | |
typeNum := num % 10 | |
if typeNum > 7 { | |
typeNum = 7 | |
} | |
dataChan <- Data{Num: num, MD5: md5, TypeNum: typeNum} | |
} | |
close(dataChan) | |
}() | |
wg.Wait() | |
} | |
func insertToDb(num, typeNum, pid int, md5 string) error { | |
// lock.Lock() | |
// defer lock.Unlock() | |
if num % 10000 == 0 { | |
fmt.Println("Inserting", num, md5, typeNum, pid) | |
} | |
_, err := DB.Exec("INSERT INTO table_update (num, md5, type, pid) VALUES (?, ?, ?, ?)", num, md5, typeNum, pid) | |
return err | |
} |
# 验证数据
mysql> select count(1) from table_update; | |
+----------+ | |
| count(1) | | |
+----------+ | |
| 39999999 | | |
+----------+ | |
1 row in set (5.82 sec) | |
mysql> SELECT * FROM table_update WHERE (type < 7 and type != (num%10) ) or(type = 7 and (num%10) not in (7,8,9) ) limit 100; | |
Empty set (13.12 sec) |
# 操作流程
# 执行 update
UPDATE table_update SET type = 10; | |
-- 执行五秒后,执行 | |
mysql> select concat('kill ', id, ';') from information_schema.processlist where Command != 'Sleep' and Time > 5 order by Time desc; | |
+--------------------------+ | |
| concat('kill ', id, ';') | | |
+--------------------------+ | |
| kill 190588; | | |
+--------------------------+ | |
-- 再执行 | |
kill 190588; | |
-- 此时 | |
mysql> UPDATE table_update SET type = 10; | |
ERROR 2013 (HY000): Lost connection to MySQL server during query | |
No connection. Trying to reconnect... | |
Connection id: 190589 | |
Current database: log_analysis |
发现 cpu 还在持续占用
-- 查询 2 | |
mysql> SHOW PROCESSLIST; | |
+--------+------+-----------------------+--------------+---------+------+----------+-----------------------------------+ | |
| Id | User | Host | db | Command | Time | State | Info | | |
+--------+------+-----------------------+--------------+---------+------+----------+-----------------------------------+ | |
| 190578 | root | 172.17.0.1:51578 | log_analysis | Query | 0 | starting | SHOW PROCESSLIST | | |
| 190580 | root | 172.17.0.1:47540 | log_analysis | Sleep | 101 | | NULL | | |
| 190584 | root | 192.168.102.225:11749 | log_analysis | Sleep | 677 | | NULL | | |
| 190585 | root | 192.168.102.225:11750 | log_analysis | Sleep | 677 | | NULL | | |
| 190586 | root | 192.168.102.225:11769 | log_analysis | Sleep | 677 | | NULL | | |
| 190589 | root | 172.17.0.1:52630 | log_analysis | Query | 101 | updating | UPDATE table_update SET type = 10 | | |
+--------+------+-----------------------+--------------+---------+------+----------+-----------------------------------+ | |
6 rows in set (0.00 sec) | |
-- kill 2 | |
mysql> kill 190589; | |
Query OK, 0 rows affected (0.00 sec) | |
-- 查询 3 | |
mysql> SHOW PROCESSLIST; | |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+ | |
| Id | User | Host | db | Command | Time | State | Info | | |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+ | |
| 190578 | root | 172.17.0.1:51578 | log_analysis | Query | 0 | starting | SHOW PROCESSLIST | | |
| 190580 | root | 172.17.0.1:47540 | log_analysis | Sleep | 113 | | NULL | | |
| 190584 | root | 192.168.102.225:11749 | log_analysis | Sleep | 689 | | NULL | | |
| 190585 | root | 192.168.102.225:11750 | log_analysis | Sleep | 689 | | NULL | | |
| 190586 | root | 192.168.102.225:11769 | log_analysis | Sleep | 689 | | NULL | | |
| 190589 | root | 172.17.0.1:52630 | log_analysis | Killed | 113 | query end | UPDATE table_update SET type = 10 | | |
| 190590 | root | 172.17.0.1:38560 | log_analysis | Query | 5 | updating | UPDATE table_update SET type = 10 | | |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+ | |
7 rows in set (0.00 sec) | |
-- 查询 4 | |
mysql> SHOW PROCESSLIST; | |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+ | |
| Id | User | Host | db | Command | Time | State | Info | | |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+ | |
| 190578 | root | 172.17.0.1:51578 | log_analysis | Query | 0 | starting | SHOW PROCESSLIST | | |
| 190580 | root | 172.17.0.1:47540 | log_analysis | Sleep | 137 | | NULL | | |
| 190584 | root | 192.168.102.225:11749 | log_analysis | Sleep | 713 | | NULL | | |
| 190585 | root | 192.168.102.225:11750 | log_analysis | Sleep | 713 | | NULL | | |
| 190586 | root | 192.168.102.225:11769 | log_analysis | Sleep | 713 | | NULL | | |
| 190589 | root | 172.17.0.1:52630 | log_analysis | Killed | 137 | query end | UPDATE table_update SET type = 10 | | |
| 190590 | root | 172.17.0.1:38560 | log_analysis | Query | 29 | updating | UPDATE table_update SET type = 10 | | |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+ | |
7 rows in set (0.00 sec) | |
-- kill 4 | |
mysql> kill 190590; | |
Query OK, 0 rows affected (0.00 sec) | |
-- 最后的查询 | |
mysql> SHOW PROCESSLIST; | |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+ | |
| Id | User | Host | db | Command | Time | State | Info | | |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+ | |
| 190578 | root | 172.17.0.1:51578 | log_analysis | Query | -1 | starting | SHOW PROCESSLIST | | |
| 190580 | root | 172.17.0.1:47540 | log_analysis | Sleep | 212 | | NULL | | |
| 190584 | root | 192.168.102.225:11749 | log_analysis | Sleep | 22 | | NULL | | |
| 190585 | root | 192.168.102.225:11750 | log_analysis | Sleep | 22 | | NULL | | |
| 190586 | root | 192.168.102.225:11769 | log_analysis | Sleep | 22 | | NULL | | |
| 190589 | root | 172.17.0.1:52630 | log_analysis | Killed | 212 | query end | UPDATE table_update SET type = 10 | | |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+ | |
6 rows in set (0.00 sec) |
# 校验数据
mysql> SELECT * FROM table_update WHERE (type < 7 and type != (num%10) ) or(type = 7 and (num%10) not in (7,8,9) ) limit 100; | |
Empty set (15.69 sec) |
最后,数据没啥问题
# 总结
mysql innodb 默认开启 autocommit,所以哪怕是单条 update 也会有事务,所以在事务未完成完成的情况下,数据大概率不会有什么问题。
但是 update 全表会有一个表级的锁,会导致其他事务无法对该表进行操作,本次模拟没有测试这种情况,主要研究数据是否会更改成功。
至于 kill 失败的问题。状态显示为 Killed
,并且处于 query end
状态。这通常是因为该查询正在等待某些资源或锁释放,或者正在进行某些清理操作,这些操作可能需要一些时间。
至于本次操作事故,已经过去几个月了,在发现问题的第一时间,找有权限的人 kill 的那个 query 时,也是像本次模拟一样,kill 不掉,后面更高权限的人直接终止了所有 query,回滚了数据库,我们一群人校验数据到半夜三更,好在没有引起更大的问题。
感谢所有小伙伴!感谢数据库备份!感谢冗余数据!
参考资料:https://xiaolincoding.com/mysql/lock/update_index.html# 如何避免这种事故的发生