# 全表 update

# 背景

前几天在写 php 脚本的时候,复制别人的脚本。结果我改写过程中,update 的条件漏了 where 条件,当时就 throw error 了,按照我的经验我以为执行已经结束了,后面实际并没有执行结束。

所以说,我对一个,线上,非常关键的表,量级大概 20g 数据,4 千万条数据的表执行了全表 update,并且执行了近乎一个小时。

按照我对 mysql 事务的理解,一个 update 应该是一个事务,哪怕执行了一个小时,只要没有完全执行成功,都是不会实际更改表数据的。

所以,我打算复现一下四千万级别的数据,执行全表 update ,执行一会之后再 kill,校验数据是否正确,验证是否数据会有异常

# 初始流程

# 表结构

CREATE TABLE `table_update` (
  `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
  `num` int(11) NOT NULL DEFAULT '0',
  `md5` varchar(16) NOT NULL DEFAULT '',
  `type` int(11) NOT NULL DEFAULT '0',
  `pid` int(11) NOT NULL DEFAULT '0',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4;

其中,md5 为 num 的 md5,type 为 num%10,如果 num%10 >7 这 type =7

mysql 机器为 4c8g 的小主机,500g 固态硬盘。

Server version: 5.7.42 MySQL Community Server (GPL)

# 导入数据

package main
import (
	"crypto/md5"
	"database/sql"
	"encoding/hex"
	"fmt"
	"runtime"
	"strconv"
	"sync"
	_ "github.com/go-sql-driver/mysql"
)
var DB *sql.DB
var lock sync.Mutex
type Data struct {
	Num     int
	MD5     string
	TypeNum int
}
func calculateMD5(num int) string {
	hasher := md5.New()
	hasher.Write([]byte(strconv.Itoa(num)))
	return hex.EncodeToString(hasher.Sum(nil))[:16]
}
func connectToDB() (*sql.DB, error) {
	db, err := sql.Open("mysql", "root:pass@tcp(localhost:3306)/log_analysis")
	return db, err
}
func main() {
	runtime.GOMAXPROCS(4)
	var err error
	DB, err = connectToDB()
	if err != nil {
		fmt.Println(err)
		return
	}
	defer DB.Close()
	var maxID sql.NullInt64
	err = DB.QueryRow("SELECT MAX(id) FROM table_update").Scan(&maxID)
	if err != nil {
		fmt.Println(err)
		return
	}
	start := 1
	if maxID.Valid {
		start = int(maxID.Int64) + 1
	}
	dataChan := make(chan Data, 1000)
	var wg sync.WaitGroup
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func(pid int) {
			defer wg.Done()
			for data := range dataChan {
				err := insertToDb(data.Num, data.TypeNum, pid, data.MD5)
				if err != nil {
					fmt.Println(err)
					continue
				}
			}
		}(i)
	}
	go func() {
		for i := start + 1; i <= 40000000; i++ {
			num := i
			md5 := calculateMD5(num)
			typeNum := num % 10
			if typeNum > 7 {
				typeNum = 7
			}
			dataChan <- Data{Num: num, MD5: md5, TypeNum: typeNum}
		}
		close(dataChan)
	}()
	wg.Wait()
}
func insertToDb(num, typeNum, pid int, md5 string) error {
	// lock.Lock()
	// defer lock.Unlock()
	if num % 10000 == 0 {
		fmt.Println("Inserting", num, md5, typeNum, pid)
	}
	_, err := DB.Exec("INSERT INTO table_update (num, md5, type, pid) VALUES (?, ?, ?, ?)", num, md5, typeNum, pid)
	return err
}

# 验证数据

mysql> select count(1) from table_update;
+----------+
| count(1) |
+----------+
| 39999999 |
+----------+
1 row in set (5.82 sec)
mysql> SELECT * FROM table_update WHERE (type < 7 and type != (num%10) ) or(type = 7 and (num%10) not in (7,8,9) ) limit 100;
Empty set (13.12 sec)

# 操作流程

# 执行 update

UPDATE table_update SET type = 10;
-- 执行五秒后,执行
mysql> select concat('kill ', id, ';') from information_schema.processlist where Command != 'Sleep' and Time > 5 order by Time desc;
+--------------------------+
| concat('kill ', id, ';') |
+--------------------------+
| kill 190588;             |
+--------------------------+
-- 再执行 
kill 190588;
-- 此时
mysql> UPDATE table_update SET type = 10;
ERROR 2013 (HY000): Lost connection to MySQL server during query
No connection. Trying to reconnect...
Connection id:    190589
Current database: log_analysis

发现 cpu 还在持续占用

-- 查询 2
mysql> SHOW PROCESSLIST;
+--------+------+-----------------------+--------------+---------+------+----------+-----------------------------------+
| Id     | User | Host                  | db           | Command | Time | State    | Info                              |
+--------+------+-----------------------+--------------+---------+------+----------+-----------------------------------+
| 190578 | root | 172.17.0.1:51578      | log_analysis | Query   |    0 | starting | SHOW PROCESSLIST                  |
| 190580 | root | 172.17.0.1:47540      | log_analysis | Sleep   |  101 |          | NULL                              |
| 190584 | root | 192.168.102.225:11749 | log_analysis | Sleep   |  677 |          | NULL                              |
| 190585 | root | 192.168.102.225:11750 | log_analysis | Sleep   |  677 |          | NULL                              |
| 190586 | root | 192.168.102.225:11769 | log_analysis | Sleep   |  677 |          | NULL                              |
| 190589 | root | 172.17.0.1:52630      | log_analysis | Query   |  101 | updating | UPDATE table_update SET type = 10 |
+--------+------+-----------------------+--------------+---------+------+----------+-----------------------------------+
6 rows in set (0.00 sec)
-- kill 2
mysql> kill 190589;
Query OK, 0 rows affected (0.00 sec)
-- 查询 3
mysql> SHOW PROCESSLIST;
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+
| Id     | User | Host                  | db           | Command | Time | State     | Info                              |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+
| 190578 | root | 172.17.0.1:51578      | log_analysis | Query   |    0 | starting  | SHOW PROCESSLIST                  |
| 190580 | root | 172.17.0.1:47540      | log_analysis | Sleep   |  113 |           | NULL                              |
| 190584 | root | 192.168.102.225:11749 | log_analysis | Sleep   |  689 |           | NULL                              |
| 190585 | root | 192.168.102.225:11750 | log_analysis | Sleep   |  689 |           | NULL                              |
| 190586 | root | 192.168.102.225:11769 | log_analysis | Sleep   |  689 |           | NULL                              |
| 190589 | root | 172.17.0.1:52630      | log_analysis | Killed  |  113 | query end | UPDATE table_update SET type = 10 |
| 190590 | root | 172.17.0.1:38560      | log_analysis | Query   |    5 | updating  | UPDATE table_update SET type = 10 |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+
7 rows in set (0.00 sec)
-- 查询 4
mysql> SHOW PROCESSLIST;
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+
| Id     | User | Host                  | db           | Command | Time | State     | Info                              |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+
| 190578 | root | 172.17.0.1:51578      | log_analysis | Query   |    0 | starting  | SHOW PROCESSLIST                  |
| 190580 | root | 172.17.0.1:47540      | log_analysis | Sleep   |  137 |           | NULL                              |
| 190584 | root | 192.168.102.225:11749 | log_analysis | Sleep   |  713 |           | NULL                              |
| 190585 | root | 192.168.102.225:11750 | log_analysis | Sleep   |  713 |           | NULL                              |
| 190586 | root | 192.168.102.225:11769 | log_analysis | Sleep   |  713 |           | NULL                              |
| 190589 | root | 172.17.0.1:52630      | log_analysis | Killed  |  137 | query end | UPDATE table_update SET type = 10 |
| 190590 | root | 172.17.0.1:38560      | log_analysis | Query   |   29 | updating  | UPDATE table_update SET type = 10 |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+
7 rows in set (0.00 sec)
-- kill 4
mysql> kill 190590;
Query OK, 0 rows affected (0.00 sec)
-- 最后的查询
mysql> SHOW PROCESSLIST;
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+
| Id     | User | Host                  | db           | Command | Time | State     | Info                              |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+
| 190578 | root | 172.17.0.1:51578      | log_analysis | Query   |   -1 | starting  | SHOW PROCESSLIST                  |
| 190580 | root | 172.17.0.1:47540      | log_analysis | Sleep   |  212 |           | NULL                              |
| 190584 | root | 192.168.102.225:11749 | log_analysis | Sleep   |   22 |           | NULL                              |
| 190585 | root | 192.168.102.225:11750 | log_analysis | Sleep   |   22 |           | NULL                              |
| 190586 | root | 192.168.102.225:11769 | log_analysis | Sleep   |   22 |           | NULL                              |
| 190589 | root | 172.17.0.1:52630      | log_analysis | Killed  |  212 | query end | UPDATE table_update SET type = 10 |
+--------+------+-----------------------+--------------+---------+------+-----------+-----------------------------------+
6 rows in set (0.00 sec)

# 校验数据

mysql> SELECT * FROM table_update WHERE (type < 7 and type != (num%10) ) or(type = 7 and (num%10) not in (7,8,9) ) limit 100;
Empty set (15.69 sec)

最后,数据没啥问题

# 总结

mysql innodb 默认开启 autocommit,所以哪怕是单条 update 也会有事务,所以在事务未完成完成的情况下,数据大概率不会有什么问题。

但是 update 全表会有一个表级的锁,会导致其他事务无法对该表进行操作,本次模拟没有测试这种情况,主要研究数据是否会更改成功。

至于 kill 失败的问题。状态显示为 Killed ,并且处于 query end 状态。这通常是因为该查询正在等待某些资源或锁释放,或者正在进行某些清理操作,这些操作可能需要一些时间。

至于本次操作事故,已经过去几个月了,在发现问题的第一时间,找有权限的人 kill 的那个 query 时,也是像本次模拟一样,kill 不掉,后面更高权限的人直接终止了所有 query,回滚了数据库,我们一群人校验数据到半夜三更,好在没有引起更大的问题。

感谢所有小伙伴!感谢数据库备份!感谢冗余数据!

参考资料:https://xiaolincoding.com/mysql/lock/update_index.html# 如何避免这种事故的发生

-->