SquirrelConfig/others/script/rime/sort.go

package rime

import (
	"bufio"
	"crypto/sha1"
	"encoding/hex"
	"fmt"
	mapset "github.com/deckarep/golang-set/v2"
	"io"
	"log"
	"os"
	"path"
	"sort"
	"strconv"
	"strings"
	"time"
)

func Sort(dictPath string, _type int) {
	// 控制台输出
	defer updateVersion(dictPath, getSha1(dictPath))
	defer printfTimeCost("排序 "+path.Base(dictPath), time.Now())

	// 打开文件
	file, err := os.OpenFile(dictPath, os.O_RDWR, 0644)
	if err != nil {
		log.Fatalln(err)
	}
	defer file.Close()

	// 前缀内容切片、词库切片，前者原封不动写入，后者排序后写入
	prefixContents := make([]string, 0) // 前置内容
	contents := make([]lemma, 0)        // 词库
	aSet := mapset.NewSet[string]()     // 去重用的的 set

	isMark := false
	sc := bufio.NewScanner(file)
	for sc.Scan() {
		line := sc.Text()

		if !isMark {
			// mark 之前的写入 prefixContents
			prefixContents = append(prefixContents, line)
			if strings.HasPrefix(line, mark) {
				isMark = true
			}
			continue
		}

		// 分割为 text code weight
		parts := strings.Split(line, "\t")
		text, code, weight := parts[0], "", ""

		// 检查分割长度
		if (_type == 1 || _type == 2 || _type == 3) && len(parts) != _type {
			log.Println("分割错误123")
		}
		if _type == 4 && len(parts) != 2 {
			fmt.Println("分割错误4")
		}

		// 将 base 中注释了但没删除的词汇的权重调为 0
		if dictPath == BasePath && strings.HasPrefix(line, "# ") {
			parts[2] = "0"
		}

		// mark 之后的，写入到 contents
		// 词库自身有重复内容则直接排除，不重复的写入到 contents
		switch _type {
		case 1: // 一列 【汉字】
			if aSet.Contains(text) {
				fmt.Println("重复：", line)
				continue
			}
			aSet.Add(text)
			contents = append(contents, lemma{text: text})
		case 2: // 两列 【汉字+注音】
			text, code = parts[0], parts[1]
			if aSet.Contains(text + code) {
				fmt.Println("重复：", line)
				continue
			}
			aSet.Add(text + code)
			contents = append(contents, lemma{text: text, code: code})
		case 3: // 三列 【汉字+注音+权重】
			text, code, weight = parts[0], parts[1], parts[2]
			if aSet.Contains(text + code) {
				fmt.Println("重复：", line)
				continue
			}
			aSet.Add(text + code)
			weight, _ := strconv.Atoi(weight)
			contents = append(contents, lemma{text: text, code: code, weight: weight})
		case 4: // 两列 【汉字+权重】
			text, weight = parts[0], parts[1]
			if aSet.Contains(text) {
				fmt.Println("重复：", line)
				continue
			}
			aSet.Add(text)
			weight, _ := strconv.Atoi(weight)
			contents = append(contents, lemma{text: text, weight: weight})
		}
	}
	if err := sc.Err(); err != nil {
		log.Fatalln(err)
	}

	// 排序：拼音升序，权重降序，最后直接按 Unicode 编码排序
	// 英文排序不区分大小写
	if strings.Contains(dictPath, "en.dict.yaml") {
		sort.Slice(contents, func(i, j int) bool {
			textI, textJ := strings.ToLower(contents[i].text), strings.ToLower(contents[j].text)
			if strings.HasPrefix(textI, "# ") {
				textI = textI[2:]
			}
			if strings.HasPrefix(textJ, "# ") {
				textJ = textJ[2:]
			}
			if textI != textJ {
				return textI < textJ
			}
			return false
		})
	} else {
		sort.Slice(contents, func(i, j int) bool {
			if contents[i].code != contents[j].code {
				return contents[i].code < contents[j].code
			}
			if contents[i].weight != contents[j].weight {
				return contents[i].weight > contents[j].weight
			}
			if contents[i].text != contents[j].text {
				return contents[i].text < contents[j].text
			}
			return false
		})
	}

	// 准备写入，清空文件，移动指针到开头
	err = file.Truncate(0)
	if err != nil {
		log.Fatalln(err)
	}
	_, err = file.Seek(0, 0)
	if err != nil {
		log.Fatalln(err)
	}

	// 写入前缀
	for _, line := range prefixContents {
		_, err := file.WriteString(line + "\n")
		if err != nil {
			log.Fatalln(err)
		}
	}

	// 字表、base 直接写入，不需要从其他词库去重
	if dictPath == HanziPath || dictPath == BasePath {
		for _, line := range contents {
			_, err := file.WriteString(line.text + "\t" + line.code + "\t" + strconv.Itoa(line.weight) + "\n")
			if err != nil {
				log.Fatalln(err)
			}
		}
	}

	// ext tencent 词库需要从一个或多个词库中去重后再写入
	if dictPath == ExtPath || dictPath == TencentPath {
		var intersect mapset.Set[string] // 交集，有交集的就是重复的
		switch dictPath {
		case ExtPath:
			intersect = ExtSet.Intersect(BaseSet)
		case TencentPath:
			intersect = TencentSet.Intersect(BaseSet.Union(ExtSet))
		}

		for _, line := range contents {
			if intersect.Contains(line.text) {
				fmt.Printf("%s 重复于其他词库：%s\n", strings.Split(path.Base(dictPath), ".")[0], line.text)
				continue
			}
			var s string
			if line.code != "" {
				s = line.text + "\t" + line.code + "\t" + strconv.Itoa(line.weight) + "\n"
			} else {
				s = line.text + "\t" + strconv.Itoa(line.weight) + "\n"
			}
			_, err := file.WriteString(s)
			if err != nil {
				log.Fatalln(err)
			}
		}
	}

	// 外部词库或临时文件，只排序，不去重
	if !contains([]string{HanziPath, BasePath, ExtPath, TencentPath}, dictPath) {
		switch _type {
		case 1:
			for _, line := range contents {
				_, err := file.WriteString(line.text + "\n")
				if err != nil {
					log.Fatalln(err)
				}
			}
		case 2:
			for _, line := range contents {
				_, err := file.WriteString(line.text + "\t" + line.code + "\n")
				if err != nil {
					log.Fatalln(err)
				}
			}
		case 3:
			for _, line := range contents {
				_, err := file.WriteString(line.text + "\t" + line.code + "\t" + strconv.Itoa(line.weight) + "\n")
				if err != nil {
					log.Fatalln(err)
				}
			}
		case 4:
			for _, line := range contents {
				_, err := file.WriteString(line.text + "\t" + strconv.Itoa(line.weight) + "\n")
				if err != nil {
					log.Fatalln(err)
				}
			}
		}
	}

	if err := file.Sync(); err != nil {
		log.Fatalln(err)
	}
}

// 获取文件 sha1
func getSha1(dictPath string) string {
	f, err := os.Open(dictPath)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	sha1Handle := sha1.New()
	if _, err := io.Copy(sha1Handle, f); err != nil {
		log.Fatal(err)
	}

	return hex.EncodeToString(sha1Handle.Sum(nil))
}

// 排序后，如果文件有改动，则修改 version 日期，并输出 "...sorted"
func updateVersion(dictPath string, oldSha1 string) {
	// 判断文件是否有改变
	newSha1 := getSha1(dictPath)
	if newSha1 == oldSha1 {
		fmt.Println()
		return
	}
	fmt.Println(" ...sorted")

	// 打开文件
	file, err := os.OpenFile(dictPath, os.O_RDWR, 0644)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	// 修改那一行
	arr := make([]string, 0)
	sc := bufio.NewScanner(file)
	for sc.Scan() {
		line := sc.Text()
		if strings.HasPrefix(line, "version:") {
			s := fmt.Sprintf("version: \"%s\"", time.Now().Format("2006-01-02"))
			arr = append(arr, s)
		} else {
			arr = append(arr, line)
		}
	}

	// 重新写入
	err = file.Truncate(0)
	if err != nil {
		log.Fatal(err)
	}
	_, err = file.Seek(0, 0)
	if err != nil {
		log.Fatal(err)
	}
	for _, line := range arr {
		_, err := file.WriteString(line + "\n")
		if err != nil {
			log.Fatal(err)
		}
	}

	err = file.Sync()
	if err != nil {
		log.Fatal(err)
	}
}
初始化 2024-03-20 03:18:06 +00:00			`package rime`

			`import (`
			`"bufio"`
			`"crypto/sha1"`
			`"encoding/hex"`
			`"fmt"`
			`mapset "github.com/deckarep/golang-set/v2"`
			`"io"`
			`"log"`
			`"os"`
			`"path"`
			`"sort"`
			`"strconv"`
			`"strings"`
			`"time"`
			`)`

			`func Sort(dictPath string, _type int) {`
			`// 控制台输出`
			`defer updateVersion(dictPath, getSha1(dictPath))`
			`defer printfTimeCost("排序 "+path.Base(dictPath), time.Now())`

			`// 打开文件`
			`file, err := os.OpenFile(dictPath, os.O_RDWR, 0644)`
			`if err != nil {`
			`log.Fatalln(err)`
			`}`
			`defer file.Close()`

			`// 前缀内容切片、词库切片，前者原封不动写入，后者排序后写入`
			`prefixContents := make([]string, 0) // 前置内容`
			`contents := make([]lemma, 0) // 词库`
			`aSet := mapset.NewSet[string]() // 去重用的的 set`

			`isMark := false`
			`sc := bufio.NewScanner(file)`
			`for sc.Scan() {`
			`line := sc.Text()`

			`if !isMark {`
			`// mark 之前的写入 prefixContents`
			`prefixContents = append(prefixContents, line)`
			`if strings.HasPrefix(line, mark) {`
			`isMark = true`
			`}`
			`continue`
			`}`

			`// 分割为 text code weight`
			`parts := strings.Split(line, "\t")`
			`text, code, weight := parts[0], "", ""`

			`// 检查分割长度`
			`if (_type == 1 \|\| _type == 2 \|\| _type == 3) && len(parts) != _type {`
			`log.Println("分割错误123")`
			`}`
			`if _type == 4 && len(parts) != 2 {`
			`fmt.Println("分割错误4")`
			`}`

			`// 将 base 中注释了但没删除的词汇的权重调为 0`
			`if dictPath == BasePath && strings.HasPrefix(line, "# ") {`
			`parts[2] = "0"`
			`}`

			`// mark 之后的，写入到 contents`
			`// 词库自身有重复内容则直接排除，不重复的写入到 contents`
			`switch _type {`
			`case 1: // 一列【汉字】`
			`if aSet.Contains(text) {`
			`fmt.Println("重复：", line)`
			`continue`
			`}`
			`aSet.Add(text)`
			`contents = append(contents, lemma{text: text})`
			`case 2: // 两列【汉字+注音】`
			`text, code = parts[0], parts[1]`
			`if aSet.Contains(text + code) {`
			`fmt.Println("重复：", line)`
			`continue`
			`}`
			`aSet.Add(text + code)`
			`contents = append(contents, lemma{text: text, code: code})`
			`case 3: // 三列【汉字+注音+权重】`
			`text, code, weight = parts[0], parts[1], parts[2]`
			`if aSet.Contains(text + code) {`
			`fmt.Println("重复：", line)`
			`continue`
			`}`
			`aSet.Add(text + code)`
			`weight, _ := strconv.Atoi(weight)`
			`contents = append(contents, lemma{text: text, code: code, weight: weight})`
			`case 4: // 两列【汉字+权重】`
			`text, weight = parts[0], parts[1]`
			`if aSet.Contains(text) {`
			`fmt.Println("重复：", line)`
			`continue`
			`}`
			`aSet.Add(text)`
			`weight, _ := strconv.Atoi(weight)`
			`contents = append(contents, lemma{text: text, weight: weight})`
			`}`
			`}`
			`if err := sc.Err(); err != nil {`
			`log.Fatalln(err)`
			`}`

			`// 排序：拼音升序，权重降序，最后直接按 Unicode 编码排序`
			`// 英文排序不区分大小写`
			`if strings.Contains(dictPath, "en.dict.yaml") {`
			`sort.Slice(contents, func(i, j int) bool {`
			`textI, textJ := strings.ToLower(contents[i].text), strings.ToLower(contents[j].text)`
			`if strings.HasPrefix(textI, "# ") {`
			`textI = textI[2:]`
			`}`
			`if strings.HasPrefix(textJ, "# ") {`
			`textJ = textJ[2:]`
			`}`
			`if textI != textJ {`
			`return textI < textJ`
			`}`
			`return false`
			`})`
			`} else {`
			`sort.Slice(contents, func(i, j int) bool {`
			`if contents[i].code != contents[j].code {`
			`return contents[i].code < contents[j].code`
			`}`
			`if contents[i].weight != contents[j].weight {`
			`return contents[i].weight > contents[j].weight`
			`}`
			`if contents[i].text != contents[j].text {`
			`return contents[i].text < contents[j].text`
			`}`
			`return false`
			`})`
			`}`

			`// 准备写入，清空文件，移动指针到开头`
			`err = file.Truncate(0)`
			`if err != nil {`
			`log.Fatalln(err)`
			`}`
			`_, err = file.Seek(0, 0)`
			`if err != nil {`
			`log.Fatalln(err)`
			`}`

			`// 写入前缀`
			`for _, line := range prefixContents {`
			`_, err := file.WriteString(line + "\n")`
			`if err != nil {`
			`log.Fatalln(err)`
			`}`
			`}`

			`// 字表、base 直接写入，不需要从其他词库去重`
			`if dictPath == HanziPath \|\| dictPath == BasePath {`
			`for _, line := range contents {`
			`_, err := file.WriteString(line.text + "\t" + line.code + "\t" + strconv.Itoa(line.weight) + "\n")`
			`if err != nil {`
			`log.Fatalln(err)`
			`}`
			`}`
			`}`

			`// ext tencent 词库需要从一个或多个词库中去重后再写入`
			`if dictPath == ExtPath \|\| dictPath == TencentPath {`
			`var intersect mapset.Set[string] // 交集，有交集的就是重复的`
			`switch dictPath {`
			`case ExtPath:`
			`intersect = ExtSet.Intersect(BaseSet)`
			`case TencentPath:`
			`intersect = TencentSet.Intersect(BaseSet.Union(ExtSet))`
			`}`

			`for _, line := range contents {`
			`if intersect.Contains(line.text) {`
			`fmt.Printf("%s 重复于其他词库：%s\n", strings.Split(path.Base(dictPath), ".")[0], line.text)`
			`continue`
			`}`
			`var s string`
			`if line.code != "" {`
			`s = line.text + "\t" + line.code + "\t" + strconv.Itoa(line.weight) + "\n"`
			`} else {`
			`s = line.text + "\t" + strconv.Itoa(line.weight) + "\n"`
			`}`
			`_, err := file.WriteString(s)`
			`if err != nil {`
			`log.Fatalln(err)`
			`}`
			`}`
			`}`

			`// 外部词库或临时文件，只排序，不去重`
			`if !contains([]string{HanziPath, BasePath, ExtPath, TencentPath}, dictPath) {`
			`switch _type {`
			`case 1:`
			`for _, line := range contents {`
			`_, err := file.WriteString(line.text + "\n")`
			`if err != nil {`
			`log.Fatalln(err)`
			`}`
			`}`
			`case 2:`
			`for _, line := range contents {`
			`_, err := file.WriteString(line.text + "\t" + line.code + "\n")`
			`if err != nil {`
			`log.Fatalln(err)`
			`}`
			`}`
			`case 3:`
			`for _, line := range contents {`
			`_, err := file.WriteString(line.text + "\t" + line.code + "\t" + strconv.Itoa(line.weight) + "\n")`
			`if err != nil {`
			`log.Fatalln(err)`
			`}`
			`}`
			`case 4:`
			`for _, line := range contents {`
			`_, err := file.WriteString(line.text + "\t" + strconv.Itoa(line.weight) + "\n")`
			`if err != nil {`
			`log.Fatalln(err)`
			`}`
			`}`
			`}`
			`}`

			`if err := file.Sync(); err != nil {`
			`log.Fatalln(err)`
			`}`
			`}`

			`// 获取文件 sha1`
			`func getSha1(dictPath string) string {`
			`f, err := os.Open(dictPath)`
			`if err != nil {`
			`log.Fatal(err)`
			`}`
			`defer f.Close()`

			`sha1Handle := sha1.New()`
			`if _, err := io.Copy(sha1Handle, f); err != nil {`
			`log.Fatal(err)`
			`}`

			`return hex.EncodeToString(sha1Handle.Sum(nil))`
			`}`

			`// 排序后，如果文件有改动，则修改 version 日期，并输出 "...sorted"`
			`func updateVersion(dictPath string, oldSha1 string) {`
			`// 判断文件是否有改变`
			`newSha1 := getSha1(dictPath)`
			`if newSha1 == oldSha1 {`
			`fmt.Println()`
			`return`
			`}`
			`fmt.Println(" ...sorted")`

			`// 打开文件`
			`file, err := os.OpenFile(dictPath, os.O_RDWR, 0644)`
			`if err != nil {`
			`log.Fatal(err)`
			`}`
			`defer file.Close()`

			`// 修改那一行`
			`arr := make([]string, 0)`
			`sc := bufio.NewScanner(file)`
			`for sc.Scan() {`
			`line := sc.Text()`
			`if strings.HasPrefix(line, "version:") {`
			`s := fmt.Sprintf("version: \"%s\"", time.Now().Format("2006-01-02"))`
			`arr = append(arr, s)`
			`} else {`
			`arr = append(arr, line)`
			`}`
			`}`

			`// 重新写入`
			`err = file.Truncate(0)`
			`if err != nil {`
			`log.Fatal(err)`
			`}`
			`_, err = file.Seek(0, 0)`
			`if err != nil {`
			`log.Fatal(err)`
			`}`
			`for _, line := range arr {`
			`_, err := file.WriteString(line + "\n")`
			`if err != nil {`
			`log.Fatal(err)`
			`}`
			`}`

			`err = file.Sync()`
			`if err != nil {`
			`log.Fatal(err)`
			`}`
			`}`