mirror of
https://github.com/morgan9e/systemd
synced 2026-04-15 00:47:10 +09:00
util, utf8: make ellipsize take multi-byte characters into account
Rename the old versions to ascii_*. Do not take zero-width characters into account, but do consider double-wide characters. Import the needed UTF-8 helper code from glib. v3: rebase on top of the utf8 restructuring work [zj: tweak the algorithm a bit, move new code to a separate file]
This commit is contained in:
committed by
Zbigniew Jędrzejewski-Szmek
parent
14a9283eb3
commit
f405e86de3
@@ -683,6 +683,8 @@ libsystemd_shared_la_SOURCES = \
|
||||
src/shared/exit-status.h \
|
||||
src/shared/utf8.c \
|
||||
src/shared/utf8.h \
|
||||
src/shared/gunicode.c \
|
||||
src/shared/gunicode.h \
|
||||
src/shared/pager.c \
|
||||
src/shared/pager.h \
|
||||
src/shared/ioprio.h \
|
||||
|
||||
4
TODO
4
TODO
@@ -17,10 +17,6 @@ Bugfixes:
|
||||
|
||||
* properly handle .mount unit state tracking when two mount points are stacked one on top of another on the exact same mount point.
|
||||
|
||||
* ellipsize_mem must take into account multi-byte unicode characters, and
|
||||
- make the resulting line the requested number of *characters*, not *bytes*,
|
||||
- avoid truncating multi-byte sequences in the middle.
|
||||
|
||||
* When we detect invalid UTF-8, we can't use it in an error message:
|
||||
log...("Path is not UTF-8 clean, ignoring assignment: %s", rvalue);
|
||||
|
||||
|
||||
108
src/shared/gunicode.c
Normal file
108
src/shared/gunicode.c
Normal file
@@ -0,0 +1,108 @@
|
||||
/* gunicode.c - Unicode manipulation functions
|
||||
*
|
||||
* Copyright (C) 1999, 2000 Tom Tromey
|
||||
* Copyright 2000, 2005 Red Hat, Inc.
|
||||
*/
|
||||
|
||||
#include "gunicode.h"
|
||||
|
||||
#define unichar uint32_t
|
||||
|
||||
/**
 * utf8_prev_char:
 * @p: a pointer to a position within a UTF-8 encoded string
 *
 * Steps backwards from @p, one byte at a time, until it reaches a byte
 * that is not a UTF-8 continuation byte (i.e. not of the form 10xxxxxx),
 * and returns a pointer to that byte.
 *
 * @p itself is never examined, so it may point into the middle of a
 * character or one past the end of the string. Apart from the
 * continuation-byte test, no validation is performed.
 *
 * There is no lower bound on the scan: the caller must guarantee that a
 * non-continuation byte exists somewhere before @p.
 *
 * Return value: a pointer to the byte found (constness of @p is dropped).
 **/
char *
utf8_prev_char (const char *p)
{
  const char *cursor = p;

  /* Always move back at least one byte, then keep going while the byte
   * under the cursor is a continuation byte (top two bits are 10). */
  do
    cursor--;
  while ((*cursor & 0xc0) == 0x80);

  return (char *) cursor;
}
|
||||
|
||||
/* An inclusive range [start, end] of Unicode code points. */
struct Interval
{
  uint32_t start;
  uint32_t end;
};

/*
 * Comparison callback for bsearch().
 *
 * @key: pointer to the uint32_t code point being looked up
 * @elt: pointer to the struct Interval it is compared against
 *
 * Returns -1 if the code point lies below the interval, +1 if it lies
 * above it, and 0 if it falls inside (both bounds inclusive).
 */
static int
interval_compare (const void *key, const void *elt)
{
  uint32_t c = *(const uint32_t *) key;
  const struct Interval *interval = elt;

  if (c < interval->start)
    return -1;
  if (c > interval->end)
    return +1;

  return 0;
}

/*
 * NOTE:
 *
 * The table used by unichar_iswide() comes from glib (where the function
 * is called g_unichar_iswide()). It is generated from the Unicode
 * Character Database's file extracted/DerivedEastAsianWidth.txt using
 * glib's gen-iswide-table.py in this way:
 *
 *   ./gen-iswide-table.py < path/to/ucd/extracted/DerivedEastAsianWidth.txt | fmt
 *
 * Last update for Unicode 6.0.
 */

/**
 * unichar_iswide:
 * @c: a Unicode character
 *
 * Determines if a character is typically rendered in a double-width
 * cell, by looking @c up in a sorted table of wide code point ranges.
 *
 * Return value: true if the character is wide, false otherwise
 **/
bool
unichar_iswide (uint32_t c)
{
  /* See NOTE earlier for how to update this table. The intervals must
   * stay sorted and non-overlapping, since they are searched with
   * bsearch(). */
  static const struct Interval wide[] = {
    {0x1100, 0x115F}, {0x2329, 0x232A}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3},
    {0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3000, 0x303E}, {0x3041, 0x3096},
    {0x3099, 0x30FF}, {0x3105, 0x312D}, {0x3131, 0x318E}, {0x3190, 0x31BA},
    {0x31C0, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0x32FE},
    {0x3300, 0x4DBF}, {0x4E00, 0xA48C}, {0xA490, 0xA4C6}, {0xA960, 0xA97C},
    {0xAC00, 0xD7A3}, {0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52},
    {0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, {0xFFE0, 0xFFE6},
    {0x1B000, 0x1B001}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23A},
    {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x20000, 0x2FFFD},
    {0x30000, 0x3FFFD}
  };

  /* The key is passed by address rather than smuggled inside the pointer
   * value: that avoids integer/pointer width mismatches and never hands
   * bsearch() a NULL key for code point 0. */
  return bsearch (&c, wide, sizeof (wide) / sizeof (wide[0]), sizeof (wide[0]),
                  interval_compare) != NULL;
}
|
||||
|
||||
/* Lookup table indexed by a byte value; utf8_next_char() adds the entry
 * for the byte at the current position to the pointer. */
const char utf8_skip_data[256] = {
  /* 0x00 - 0x0F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x10 - 0x1F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x20 - 0x2F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x30 - 0x3F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x40 - 0x4F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x50 - 0x5F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x60 - 0x6F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x70 - 0x7F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x80 - 0x8F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x90 - 0x9F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0xA0 - 0xAF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0xB0 - 0xBF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0xC0 - 0xCF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  /* 0xD0 - 0xDF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  /* 0xE0 - 0xEF */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  /* 0xF0 - 0xFF */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};
|
||||
28
src/shared/gunicode.h
Normal file
28
src/shared/gunicode.h
Normal file
@@ -0,0 +1,28 @@
|
||||
/* gunicode.h - Unicode manipulation functions
|
||||
*
|
||||
* Copyright (C) 1999, 2000 Tom Tromey
|
||||
* Copyright 2000, 2005 Red Hat, Inc.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
char *utf8_prev_char (const char *p);
|
||||
|
||||
extern const char utf8_skip_data[256];
|
||||
|
||||
/**
 * utf8_next_char:
 * @p: Pointer to the start of a valid UTF-8 character
 *
 * Skips to the next character in a UTF-8 string by advancing @p by
 * utf8_skip_data[first byte of @p]. The string must be valid; this
 * macro is as fast as possible, and has no error-checking: validate
 * strings that may contain invalid UTF-8 before iterating over them
 * with it.
 *
 * You would use this macro to iterate over a string character by
 * character. The macro returns the start of the next UTF-8 character
 * as a char *.
 *
 * Note that @p is evaluated twice, so it must not have side effects.
 * The whole expansion is parenthesized so that the result can be
 * combined safely with postfix operators, e.g. utf8_next_char(p)[0].
 */
#define utf8_next_char(p) ((char *)((p) + utf8_skip_data[*(const unsigned char *)(p)]))
|
||||
|
||||
bool unichar_iswide (uint32_t c);
|
||||
@@ -98,7 +98,7 @@ static int utf8_encoded_expected_len(const char *str) {
|
||||
}
|
||||
|
||||
/* decode one unicode char */
|
||||
static int utf8_encoded_to_unichar(const char *str) {
|
||||
int utf8_encoded_to_unichar(const char *str) {
|
||||
int unichar;
|
||||
int len;
|
||||
int i;
|
||||
|
||||
@@ -35,3 +35,4 @@ char *ascii_filter(const char *s);
|
||||
char *utf16_to_utf8(const void *s, size_t length);
|
||||
|
||||
int utf8_encoded_valid_unichar(const char *str);
|
||||
int utf8_encoded_to_unichar(const char *str);
|
||||
|
||||
@@ -74,6 +74,8 @@
|
||||
#include "env-util.h"
|
||||
#include "fileio.h"
|
||||
#include "device-nodes.h"
|
||||
#include "utf8.h"
|
||||
#include "gunicode.h"
|
||||
|
||||
int saved_argc = 0;
|
||||
char **saved_argv = NULL;
|
||||
@@ -3288,7 +3290,7 @@ int running_in_chroot(void) {
|
||||
a.st_ino != b.st_ino;
|
||||
}
|
||||
|
||||
char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) {
|
||||
static char *ascii_ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) {
|
||||
size_t x;
|
||||
char *r;
|
||||
|
||||
@@ -3319,6 +3321,80 @@ char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigne
|
||||
return r;
|
||||
}
|
||||
|
||||
char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) {
|
||||
size_t x;
|
||||
char *e;
|
||||
const char *i, *j;
|
||||
unsigned k, len, len2;
|
||||
|
||||
assert(s);
|
||||
assert(percent <= 100);
|
||||
assert(new_length >= 3);
|
||||
|
||||
/* if no multibyte characters use ascii_ellipsize_mem for speed */
|
||||
if (ascii_is_valid(s))
|
||||
return ascii_ellipsize_mem(s, old_length, new_length, percent);
|
||||
|
||||
if (old_length <= 3 || old_length <= new_length)
|
||||
return strndup(s, old_length);
|
||||
|
||||
x = (new_length * percent) / 100;
|
||||
|
||||
if (x > new_length - 3)
|
||||
x = new_length - 3;
|
||||
|
||||
k = 0;
|
||||
for (i = s; k < x && i < s + old_length; i = utf8_next_char(i)) {
|
||||
int c;
|
||||
|
||||
c = utf8_encoded_to_unichar(i);
|
||||
if (c < 0)
|
||||
return NULL;
|
||||
k += unichar_iswide(c) ? 2 : 1;
|
||||
}
|
||||
|
||||
if (k > x) /* last character was wide and went over quota */
|
||||
x ++;
|
||||
|
||||
for (j = s + old_length; k < new_length && j > i; ) {
|
||||
int c;
|
||||
|
||||
j = utf8_prev_char(j);
|
||||
c = utf8_encoded_to_unichar(j);
|
||||
if (c < 0)
|
||||
return NULL;
|
||||
k += unichar_iswide(c) ? 2 : 1;
|
||||
}
|
||||
assert(i <= j);
|
||||
|
||||
/* we don't actually need to ellipsize */
|
||||
if (i == j)
|
||||
return memdup(s, old_length + 1);
|
||||
|
||||
/* make space for ellipsis */
|
||||
j = utf8_next_char(j);
|
||||
|
||||
len = i - s;
|
||||
len2 = s + old_length - j;
|
||||
e = new(char, len + 3 + len2 + 1);
|
||||
if (!e)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
printf("old_length=%zu new_length=%zu x=%zu len=%u len2=%u k=%u\n",
|
||||
old_length, new_length, x, len, len2, k);
|
||||
*/
|
||||
|
||||
memcpy(e, s, len);
|
||||
e[len] = 0xe2; /* tri-dot ellipsis: … */
|
||||
e[len + 1] = 0x80;
|
||||
e[len + 2] = 0xa6;
|
||||
|
||||
memcpy(e + len + 3, j, len2 + 1);
|
||||
|
||||
return e;
|
||||
}
|
||||
|
||||
/* Convenience wrapper around ellipsize_mem() for NUL-terminated strings:
 * the byte length of s is taken with strlen() and passed as old_length,
 * length is passed as new_length, and percent is forwarded unchanged.
 * Returns whatever ellipsize_mem() returns. */
char *ellipsize(const char *s, size_t length, unsigned percent) {
        const size_t n_bytes = strlen(s);

        return ellipsize_mem(s, n_bytes, length, percent);
}
|
||||
|
||||
@@ -405,6 +405,7 @@ static inline const char *ansi_highlight_off(void) {
|
||||
int running_in_chroot(void);
|
||||
|
||||
char *ellipsize(const char *s, size_t length, unsigned percent);
|
||||
/* bytes columns */
|
||||
char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent);
|
||||
|
||||
int touch(const char *path);
|
||||
|
||||
Reference in New Issue
Block a user