3rdparty/utf8proc: Updated to 2.9.0.

This commit is contained in:
Vas Crabb 2023-12-06 07:05:45 +11:00
parent 12590d6ad8
commit 466c450cb3
28 changed files with 13449 additions and 12383 deletions

View File

@ -0,0 +1,23 @@
# Fuzzing workflow: builds and runs the OSS-Fuzz fuzzers registered for the
# 'utf8proc' project on every pull request.
name: CIFuzz

on:
  - pull_request

jobs:
  Fuzzing:
    runs-on: ubuntu-latest
    steps:
      # Compile the fuzz targets for the OSS-Fuzz project named below.
      - name: Build Fuzzers
        uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
        with:
          oss-fuzz-project-name: 'utf8proc'
          dry-run: false

      # Run the fuzzers that were just built, for 600 seconds.
      - name: Run Fuzzers
        uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
        with:
          oss-fuzz-project-name: 'utf8proc'
          fuzz-seconds: 600
          dry-run: false

      # Only when an earlier step failed: keep the contents of ./out/artifacts
      # as a downloadable workflow artifact.
      - name: Upload Crash
        uses: actions/upload-artifact@v1
        if: failure()
        with:
          name: artifacts
          path: ./out/artifacts

View File

@ -0,0 +1,64 @@
# CMake workflow: configures, builds and tests utf8proc with CMake, both as a
# shared and as a static library, on the hosted runners and under MSYS2.
name: CMake

on:
  push:
    branches:
      - master
      - 'release-*'
  pull_request:
    # run on all pr

jobs:
  # Native toolchain of each hosted runner image.
  build:
    strategy:
      matrix:
        os:
          - ubuntu-latest
          - windows-latest
          - macOS-latest
        # Quoted so the values stay strings and are passed verbatim to
        # -DBUILD_SHARED_LIBS.
        shared:
          - "ON"
          - "OFF"
    runs-on: ${{ matrix.os }}
    name: ${{ matrix.os }} - shared=${{ matrix.shared }}
    steps:
      - uses: actions/checkout@v2

      - name: Build
        run: |
          mkdir build
          cmake -S . -B build -DBUILD_SHARED_LIBS=${{ matrix.shared }} -DUTF8PROC_ENABLE_TESTING=ON
          cmake --build build

      - name: Run Test
        run: ctest --test-dir build -V

      # Artifacts are published for the shared-library configuration only.
      - name: Upload shared lib
        if: matrix.shared == 'ON'
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.os }}
          path: |
            build/libutf8proc.*
            build/Debug/utf8proc.*

  # Windows build inside an MSYS2 shell using the 'MSYS Makefiles' generator.
  mingw:
    strategy:
      matrix:
        os:
          - windows-latest
        shared:
          - "ON"
          - "OFF"
    runs-on: ${{ matrix.os }}
    name: mingw64 - shared=${{ matrix.shared }}
    defaults:
      run:
        # Every `run:` step of this job executes in the MSYS2 shell.
        shell: msys2 {0}
    steps:
      - uses: actions/checkout@v2

      - uses: msys2/setup-msys2@v2
        with:
          install: gcc make mingw-w64-x86_64-cmake

      - name: Build
        run: |
          mkdir build
          cmake -S . -B build -DBUILD_SHARED_LIBS=${{ matrix.shared }} -DUTF8PROC_ENABLE_TESTING=ON -G'MSYS Makefiles'
          cmake --build build

      - name: Run Test
        run: ctest --test-dir build -V

      - name: Upload shared lib
        if: matrix.shared == 'ON'
        uses: actions/upload-artifact@v2
        with:
          name: windows-mingw64
          path: build/libutf8proc.*

View File

@ -0,0 +1,41 @@
# Make workflow: runs the Makefile-based checks (MANIFEST, test suite,
# regenerated data table, clean tree) and publishes the built library.
name: Make

on:
  push:
    branches:
      - master
      - 'release-*'
  pull_request:
    # run on all pr

jobs:
  build:
    strategy:
      matrix:
        os: [ubuntu-latest, macOS-latest]
    runs-on: ${{ matrix.os }}
    name: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v2

      # TODO: update makefile to check MANIFEST
      # - name: Install dependencies (MacOS)
      #   if: matrix.os == 'macos-latest'
      #   run: brew install ruby findutils

      # The matrix defines only `os`, so the condition must read `matrix.os`.
      # The former `matrix.config.os` never matched, which silently skipped
      # this step on every runner.
      - name: Check MANIFEST
        if: matrix.os == 'ubuntu-latest'
        run: make manifest && diff MANIFEST.new MANIFEST

      - name: Run Test
        run: make check

      # Regenerate the data table and require it to match the checked-in copy.
      - name: Check utf8proc_data.c
        run: make data && diff data/utf8proc_data.c.new utf8proc_data.c

      # After `make clean`, git must report no untracked or ignored files.
      - name: Clean
        run: make clean && git status --ignored --porcelain && test -z "$(git status --ignored --porcelain)"

      - name: Make lib
        run: make

      - name: Upload shared lib
        uses: actions/upload-artifact@v2
        with:
          name: make-${{ matrix.os }}
          path: libutf8proc.*

38
3rdparty/utf8proc/.gitignore vendored Normal file
View File

@ -0,0 +1,38 @@
# Release tarballs
*.tar.gz

# Compiled objects, libraries and executables
*.exe
*.dll
*.do
*.o
*.so*
*.a
*.dylib
*.dSYM
*.out

# Freshly generated files that are diffed against the checked-in copies
# (e.g. MANIFEST.new)
*.new

# Editor settings
.vscode

# Files under data/ that are not checked in
/data/*.txt
/data/*.ttf
/data/*.sfd

# Generated documentation
/docs/

# Benchmark build products
/bench/bench
/bench/icu
/bench/unistring

# Test programs
/test/normtest
/test/graphemetest
/test/printproperty
/test/charwidth
/test/misc
/test/valid
/test/iterate
/test/case
/test/iscase
/test/custom

# Scratch and build directories
/tmp/
/mingw_static/
/mingw_shared/
/msvc_shared/
/msvc_static/
/build/

# Miscellaneous generated files
NEWS-update.jl
libutf8proc.pc

View File

@ -1,22 +0,0 @@
language: c
compiler:
- gcc
- clang
notifications:
email: false
before_install:
- sudo add-apt-repository ppa:staticfloat/julia-deps -y
- sudo add-apt-repository ppa:staticfloat/juliareleases -y
- sudo apt-get update -qq -y
- sudo apt-get install libpcre3-dev julia fontforge -y
script:
- make manifest && diff MANIFEST.new MANIFEST
- make check
- make data && diff data/utf8proc_data.c.new utf8proc_data.c
- make clean && git status --ignored --porcelain && test -z "$(git status --ignored --porcelain)"
- (mkdir build_static && cd build_static && cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON && make)
- (mkdir build_shared && cd build_shared && cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_SHARED_LIBS=ON && make)
env:
# use JuliaLang caching (https://github.com/staticfloat/cache.julialang.org)
# so that Travis builds do not depend on anyone's flaky servers but our own
- URLCACHE=https://cache.julialang.org/ CFLAGS="-O2 -Werror -Wmissing-prototypes"

View File

@ -1,20 +1,24 @@
cmake_minimum_required (VERSION 2.8.12)
cmake_minimum_required (VERSION 3.0.0)
include (utils.cmake)
disallow_intree_builds()
project (utf8proc C)
if (POLICY CMP0048)
cmake_policy (SET CMP0048 NEW)
endif ()
project (utf8proc VERSION 2.9.0 LANGUAGES C)
# This is the ABI version number, which may differ from the
# API version number (defined in utf8proc.h).
# API version number (defined in utf8proc.h and above).
# Be sure to also update these in Makefile and MANIFEST!
set(SO_MAJOR 2)
set(SO_MINOR 4)
set(SO_PATCH 1)
set(SO_MAJOR 3)
set(SO_MINOR 0)
set(SO_PATCH 0)
option(UTF8PROC_INSTALL "Enable installation of utf8proc" On)
option(UTF8PROC_ENABLE_TESTING "Enable testing of utf8proc" Off)
option(LIB_FUZZING_ENGINE "Fuzzing engine to link against" Off)
add_library (utf8proc
utf8proc.c
@ -50,23 +54,23 @@ set_target_properties (utf8proc PROPERTIES
)
if (UTF8PROC_INSTALL)
include(GNUInstallDirs)
install(FILES utf8proc.h DESTINATION "${CMAKE_INSTALL_FULL_INCLUDEDIR}")
install(TARGETS utf8proc
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib)
install(
FILES
"${PROJECT_SOURCE_DIR}/utf8proc.h"
DESTINATION include)
ARCHIVE DESTINATION "${CMAKE_INSTALL_FULL_LIBDIR}"
LIBRARY DESTINATION "${CMAKE_INSTALL_FULL_LIBDIR}"
RUNTIME DESTINATION "${CMAKE_INSTALL_FULL_BINDIR}"
)
configure_file(libutf8proc.pc.cmakein libutf8proc.pc @ONLY)
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/libutf8proc.pc" DESTINATION "${CMAKE_INSTALL_FULL_LIBDIR}/pkgconfig")
endif()
if(UTF8PROC_ENABLE_TESTING)
enable_testing()
file(MAKE_DIRECTORY data)
set(UNICODE_VERSION 13.0.0)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt data/NormalizationTest.txt SHOW_PROGRESS)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt data/GraphemeBreakTest.txt SHOW_PROGRESS)
set(UNICODE_VERSION 15.1.0)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS)
add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c)
target_link_libraries(case utf8proc)
add_executable(custom test/tests.h test/tests.c utf8proc.h test/custom.c)
@ -98,4 +102,12 @@ if(UTF8PROC_ENABLE_TESTING)
target_link_libraries(normtest utf8proc)
add_test(utf8proc.testgraphemetest graphemetest data/GraphemeBreakTest.txt)
add_test(utf8proc.testnormtest normtest data/NormalizationTest.txt)
if(LIB_FUZZING_ENGINE)
add_executable(fuzzer utf8proc.h test/fuzzer.c)
target_link_libraries(fuzzer ${LIB_FUZZING_ENGINE} utf8proc)
else()
add_executable(fuzzer utf8proc.h test/fuzz_main.c test/fuzzer.c)
target_link_libraries(fuzzer utf8proc)
endif()
endif()

View File

@ -1,4 +1,4 @@
# Doxyfile 1.8.18
# Doxyfile 1.9.1
# This file describes the settings to be used by the documentation system
# doxygen (www.doxygen.org) for a project.
@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
# title of most generated pages and in a few other places.
# The default value is: My Project.
PROJECT_NAME = "utf8proc"
PROJECT_NAME = utf8proc
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
# could be handy for archiving the generated documentation or if some version
@ -217,6 +217,14 @@ QT_AUTOBRIEF = NO
MULTILINE_CPP_IS_BRIEF = NO
# By default Python docstrings are displayed as preformatted text and doxygen's
# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
# doxygen's special commands can be used and the contents of the docstring
# documentation blocks is shown as doxygen documentation.
# The default value is: YES.
PYTHON_DOCSTRING = YES
# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
# documentation from any documented member that it re-implements.
# The default value is: YES.
@ -305,7 +313,10 @@ OPTIMIZE_OUTPUT_SLICE = NO
# Note: For files without extension you can use no_extension as a placeholder.
#
# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
# the files are not read by doxygen.
# the files are not read by doxygen. When specifying no_extension you should add
# * to the FILE_PATTERNS.
#
# Note see also the list of default file extension mappings.
EXTENSION_MAPPING =
@ -439,6 +450,19 @@ TYPEDEF_HIDES_STRUCT = NO
LOOKUP_CACHE_SIZE = 0
# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use
# during processing. When set to 0 doxygen will base this on the number of
# cores available in the system. You can set it explicitly to a value larger
# than 0 to get more control over the balance between CPU load and processing
# speed. At this moment only the input processing can be done using multiple
# threads. Since this is still an experimental feature the default is set to 1,
# which effectively disables parallel processing. Please report any issues you
# encounter. Generating dot graphs in parallel is controlled by the
# DOT_NUM_THREADS setting.
# Minimum value: 0, maximum value: 32, default value: 1.
NUM_PROC_THREADS = 1
#---------------------------------------------------------------------------
# Build related configuration options
#---------------------------------------------------------------------------
@ -502,6 +526,13 @@ EXTRACT_LOCAL_METHODS = NO
EXTRACT_ANON_NSPACES = NO
# If this flag is set to YES, the name of an unnamed parameter in a declaration
# will be determined by the corresponding definition. By default unnamed
# parameters remain unnamed in the output.
# The default value is: YES.
RESOLVE_UNNAMED_PARAMS = YES
# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
# undocumented members inside documented classes or files. If set to NO these
# members will be included in the various overviews, but no documentation
@ -539,11 +570,18 @@ HIDE_IN_BODY_DOCS = NO
INTERNAL_DOCS = NO
# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
# names in lower-case letters. If set to YES, upper-case letters are also
# allowed. This is useful if you have classes or files whose names only differ
# in case and if your file system supports case sensitive file names. Windows
# (including Cygwin) ands Mac users are advised to set this option to NO.
# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
# able to match the capabilities of the underlying filesystem. In case the
# filesystem is case sensitive (i.e. it supports files in the same directory
# whose names only differ in casing), the option must be set to YES to properly
# deal with such files in case they appear in the input. For filesystems that
# are not case sensitive the option should be set to NO to properly deal with
# output files written for symbols that only differ in casing, such as for two
# classes, one named CLASS and the other named Class, and to also support
# references to files without having to specify the exact matching casing. On
# Windows (including Cygwin) and MacOS, users should typically set this option
# to NO, whereas on Linux or other Unix flavors it should typically be set to
# YES.
# The default value is: system dependent.
CASE_SENSE_NAMES = NO
@ -782,7 +820,10 @@ WARN_IF_DOC_ERROR = YES
WARN_NO_PARAMDOC = NO
# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
# a warning is encountered.
# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
# at the end of the doxygen process doxygen will return with a non-zero status.
# Possible values are: NO, YES and FAIL_ON_WARNINGS.
# The default value is: NO.
WARN_AS_ERROR = NO
@ -818,8 +859,8 @@ INPUT =
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
# documentation (see: https://www.gnu.org/software/libiconv/) for the list of
# possible encodings.
# documentation (see:
# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
# The default value is: UTF-8.
INPUT_ENCODING = UTF-8
@ -832,13 +873,15 @@ INPUT_ENCODING = UTF-8
# need to set EXTENSION_MAPPING for the extension otherwise the files are not
# read by doxygen.
#
# Note the list of default checked file patterns might differ from the list of
# default file extension mappings.
#
# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment),
# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen
# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd,
# *.vhdl, *.ucf, *.qsf and *.ice.
# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl,
# *.ucf, *.qsf and *.ice.
FILE_PATTERNS =
@ -1065,13 +1108,6 @@ VERBATIM_HEADERS = YES
ALPHABETICAL_INDEX = YES
# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
# which the alphabetical index list will be split.
# Minimum value: 1, maximum value: 20, default value: 5.
# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
COLS_IN_ALPHA_INDEX = 5
# In case all classes in a project start with a common prefix, all classes will
# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
# can be used to specify a prefix (or a list of prefixes) that should be ignored
@ -1242,10 +1278,11 @@ HTML_INDEX_NUM_ENTRIES = 100
# If the GENERATE_DOCSET tag is set to YES, additional index files will be
# generated that can be used as input for Apple's Xcode 3 integrated development
# environment (see: https://developer.apple.com/xcode/), introduced with OSX
# 10.5 (Leopard). To create a documentation set, doxygen will generate a
# Makefile in the HTML output directory. Running make will produce the docset in
# that directory and running make install will install the docset in
# environment (see:
# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
# create a documentation set, doxygen will generate a Makefile in the HTML
# output directory. Running make will produce the docset in that directory and
# running make install will install the docset in
# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
# genXcode/_index.html for more information.
@ -1287,8 +1324,8 @@ DOCSET_PUBLISHER_NAME = Publisher
# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on
# Windows.
# (see:
# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows.
#
# The HTML Help Workshop contains a compiler that can convert all HTML output
# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@ -1318,7 +1355,7 @@ CHM_FILE =
HHC_LOCATION =
# The GENERATE_CHI flag controls if a separate .chi index file is generated
# (YES) or that it should be included in the master .chm file (NO).
# (YES) or that it should be included in the main .chm file (NO).
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
@ -1363,7 +1400,8 @@ QCH_FILE =
# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
# Project output. For more information please see Qt Help Project / Namespace
# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
# (see:
# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_QHP is set to YES.
@ -1371,8 +1409,8 @@ QHP_NAMESPACE = org.doxygen.Project
# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
# Help Project output. For more information please see Qt Help Project / Virtual
# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-
# folders).
# Folders (see:
# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
# The default value is: doc.
# This tag requires that the tag GENERATE_QHP is set to YES.
@ -1380,16 +1418,16 @@ QHP_VIRTUAL_FOLDER = doc
# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
# filter to add. For more information please see Qt Help Project / Custom
# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
# filters).
# Filters (see:
# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_CUST_FILTER_NAME =
# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
# custom filter to add. For more information please see Qt Help Project / Custom
# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
# filters).
# Filters (see:
# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_CUST_FILTER_ATTRS =
@ -1401,9 +1439,9 @@ QHP_CUST_FILTER_ATTRS =
QHP_SECT_FILTER_ATTRS =
# The QHG_LOCATION tag can be used to specify the location of Qt's
# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
# generated .qhp file.
# The QHG_LOCATION tag can be used to specify the location (absolute path
# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
# run qhelpgenerator on the generated .qhp file.
# This tag requires that the tag GENERATE_QHP is set to YES.
QHG_LOCATION =
@ -1484,8 +1522,8 @@ EXT_LINKS_IN_WINDOW = NO
# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
# the HTML output. These images will generally look nicer at scaled resolutions.
# Possible values are: png The default and svg Looks nicer but requires the
# pdf2svg tool.
# Possible values are: png (the default) and svg (looks nicer but requires the
# pdf2svg or inkscape tool).
# The default value is: png.
# This tag requires that the tag GENERATE_HTML is set to YES.
@ -1530,7 +1568,7 @@ USE_MATHJAX = NO
# When MathJax is enabled you can set the default output format to be used for
# the MathJax output. See the MathJax site (see:
# http://docs.mathjax.org/en/latest/output.html) for more details.
# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details.
# Possible values are: HTML-CSS (which is slower, but has the best
# compatibility), NativeMML (i.e. MathML) and SVG.
# The default value is: HTML-CSS.
@ -1560,7 +1598,8 @@ MATHJAX_EXTENSIONS =
# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
# of code that will be used on startup of the MathJax code. See the MathJax site
# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
# (see:
# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
# example see the documentation.
# This tag requires that the tag USE_MATHJAX is set to YES.
@ -1607,7 +1646,8 @@ SERVER_BASED_SEARCH = NO
#
# Doxygen ships with an example indexer (doxyindexer) and search engine
# (doxysearch.cgi) which are based on the open source search engine library
# Xapian (see: https://xapian.org/).
# Xapian (see:
# https://xapian.org/).
#
# See the section "External Indexing and Searching" for details.
# The default value is: NO.
@ -1620,8 +1660,9 @@ EXTERNAL_SEARCH = NO
#
# Doxygen ships with an example indexer (doxyindexer) and search engine
# (doxysearch.cgi) which are based on the open source search engine library
# Xapian (see: https://xapian.org/). See the section "External Indexing and
# Searching" for details.
# Xapian (see:
# https://xapian.org/). See the section "External Indexing and Searching" for
# details.
# This tag requires that the tag SEARCHENGINE is set to YES.
SEARCHENGINE_URL =
@ -1785,9 +1826,11 @@ LATEX_EXTRA_FILES =
PDF_HYPERLINKS = YES
# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
# the PDF file directly from the LaTeX files. Set this option to YES, to get a
# higher quality PDF documentation.
# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
# files. Set this option to YES, to get a higher quality PDF documentation.
#
# See also section LATEX_CMD_NAME for selecting the engine.
# The default value is: YES.
# This tag requires that the tag GENERATE_LATEX is set to YES.
@ -2298,10 +2341,32 @@ UML_LOOK = NO
# but if the number exceeds 15, the total amount of fields shown is limited to
# 10.
# Minimum value: 0, maximum value: 100, default value: 10.
# This tag requires that the tag HAVE_DOT is set to YES.
# This tag requires that the tag UML_LOOK is set to YES.
UML_LIMIT_NUM_FIELDS = 10
# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
# tag is set to YES, doxygen will add type and arguments for attributes and
# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
# will not generate fields with class member information in the UML graphs. The
# class diagrams will look similar to the default class diagrams but using UML
# notation for the relationships.
# Possible values are: NO, YES and NONE.
# The default value is: NO.
# This tag requires that the tag UML_LOOK is set to YES.
DOT_UML_DETAILS = NO
# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
# to display on a single line. If the actual line length exceeds this threshold
# significantly it will be wrapped across multiple lines. Some heuristics are
# applied to avoid ugly line breaks.
# Minimum value: 0, maximum value: 1000, default value: 17.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_WRAP_THRESHOLD = 17
# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
# collaboration graphs will show the relations between templates and their
# instances.
@ -2491,9 +2556,11 @@ DOT_MULTI_TARGETS = NO
GENERATE_LEGEND = YES
# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
# files that are used to generate the various graphs.
#
# Note: This setting is not only used for dot files but also for msc and
# plantuml temporary files.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_CLEANUP = YES

View File

@ -7,7 +7,7 @@ whose copyright and license statements are reproduced below, all new
work on the utf8proc library is licensed under the [MIT "expat"
license](http://opensource.org/licenses/MIT):
*Copyright © 2014-2019 by Steven G. Johnson, Jiahao Chen, Tony Kelman, Jonas Fonseca, and other contributors listed in the git history.*
*Copyright © 2014-2021 by Steven G. Johnson, Jiahao Chen, Tony Kelman, Jonas Fonseca, and other contributors listed in the git history.*
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),

View File

@ -2,8 +2,8 @@ include/
include/utf8proc.h
lib/
lib/libutf8proc.a
lib/libutf8proc.so -> libutf8proc.so.2.4.1
lib/libutf8proc.so.2 -> libutf8proc.so.2.4.1
lib/libutf8proc.so.2.4.1
lib/libutf8proc.so -> libutf8proc.so.3.0.0
lib/libutf8proc.so.3 -> libutf8proc.so.3.0.0
lib/libutf8proc.so.3.0.0
lib/pkgconfig/
lib/pkgconfig/libutf8proc.pc

View File

@ -11,7 +11,7 @@ PERL=perl
CFLAGS ?= -O2
PICFLAG = -fPIC
C99FLAG = -std=c99
WCFLAGS = -Wall -Wextra -pedantic
WCFLAGS = -Wsign-conversion -Wall -Wextra -pedantic
UCFLAGS = $(CPPFLAGS) $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS $(UTF8PROC_DEFINES)
LDFLAG_SHARED = -shared
SOFLAG = -Wl,-soname
@ -22,9 +22,12 @@ SOFLAG = -Wl,-soname
# compatibility is broken, even if the API is backward-compatible.
# The API version number is defined in utf8proc.h.
# Be sure to also update these ABI versions in MANIFEST and CMakeLists.txt!
MAJOR=2
MINOR=4
PATCH=1
MAJOR=3
MINOR=0
PATCH=0
# api version (also in utf8proc.h and CMakeLists.txt)
VERSION=2.9.0
OS := $(shell uname)
ifeq ($(OS),Darwin) # MacOS X
@ -78,7 +81,7 @@ utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c
libutf8proc.a: utf8proc.o
rm -f libutf8proc.a
$(AR) rs libutf8proc.a utf8proc.o
$(AR) crs libutf8proc.a utf8proc.o
libutf8proc.so.$(MAJOR).$(MINOR).$(PATCH): utf8proc.o
$(CC) $(LDFLAGS) $(LDFLAG_SHARED) -o $@ $(SOFLAG) -Wl,libutf8proc.so.$(MAJOR) utf8proc.o
@ -168,6 +171,20 @@ test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@
# Create the release tarball utf8proc-$(VERSION).tar.gz from the master branch;
# every archived path is prefixed with utf8proc-$(VERSION)/.
dist:
	git archive master --prefix=utf8proc-$(VERSION)/ -o utf8proc-$(VERSION).tar.gz

# Build the tarball, verify that the version numbers agree across utf8proc.h,
# CMakeLists.txt, MANIFEST and this Makefile, then unpack the tarball and run
# its test suite before removing the unpacked tree again.
distcheck: dist
	test `grep UTF8PROC_VERSION utf8proc.h | cut -d' ' -f3 | tr '\n' .` = $(VERSION). || exit 1
	test `grep "utf8proc VERSION" CMakeLists.txt |cut -d' ' -f 4` = $(VERSION) || exit 1
	test `grep libutf8proc.so.$(MAJOR).$(MINOR).$(PATCH) MANIFEST | wc -l` = 3 || exit 1
	test `grep 'set(SO_' CMakeLists.txt |cut -d' ' -f2 | tr -d ')' | tr '\n' '.'` = $(MAJOR).$(MINOR).$(PATCH). || exit 1
	tar xzf utf8proc-$(VERSION).tar.gz
	# Use $(MAKE) rather than a literal `make` so the sub-make is the same
	# program that is running this Makefile and inherits its flags/jobserver.
	$(MAKE) -C utf8proc-$(VERSION) check
	rm -rf utf8proc-$(VERSION)

check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
$(MAKE) -C bench
test/normtest data/NormalizationTest.txt

View File

@ -1,5 +1,29 @@
# utf8proc release history #
## Version 2.9.0 ##
2023-10-20
- Unicode 15.1 support ([#253]).
## Version 2.8.0 ##
2022-10-30
- Unicode 15 support ([#247]).
## Version 2.7.0 ##
2021-12-16
- Unicode 14 support ([#233]).
- Support `GNUInstallDirs` in CMake build ([#159]).
- `cmake` build now installs `pkg-config` file ([#224]).
- Various build and portability improvements.
## Version 2.6.1 ##
2020-12-15
@ -409,8 +433,13 @@ Release of version 1.0.1
[#152]: https://github.com/JuliaStrings/utf8proc/issues/152
[#154]: https://github.com/JuliaStrings/utf8proc/issues/154
[#156]: https://github.com/JuliaStrings/utf8proc/issues/156
[#159]: https://github.com/JuliaStrings/utf8proc/issues/159
[#167]: https://github.com/JuliaStrings/utf8proc/issues/167
[#173]: https://github.com/JuliaStrings/utf8proc/issues/173
[#179]: https://github.com/JuliaStrings/utf8proc/issues/179
[#196]: https://github.com/JuliaStrings/utf8proc/issues/196
[#205]: https://github.com/JuliaStrings/utf8proc/issues/205
[#224]: https://github.com/JuliaStrings/utf8proc/issues/224
[#233]: https://github.com/JuliaStrings/utf8proc/issues/233
[#247]: https://github.com/JuliaStrings/utf8proc/issues/247
[#253]: https://github.com/JuliaStrings/utf8proc/issues/253

View File

@ -1,5 +1,5 @@
# utf8proc
[![Travis CI Status](https://travis-ci.org/JuliaStrings/utf8proc.png)](https://travis-ci.org/JuliaStrings/utf8proc)
[![CI](https://github.com/JuliaStrings/utf8proc/actions/workflows/build-ci.yml/badge.svg)](https://github.com/JuliaStrings/utf8proc/actions/workflows/build-ci.yml)
[![AppVeyor status](https://ci.appveyor.com/api/projects/status/ivaa0v6ikxrmm5r6?svg=true)](https://ci.appveyor.com/project/StevenGJohnson/utf8proc)
[utf8proc](http://juliastrings.github.io/utf8proc/) is a small, clean C
@ -38,9 +38,8 @@ For compilation of the C library, run `make`. You can also install the library
Alternatively, you can compile with `cmake`, e.g. by
```sh
mkdir build
cd build
cmake ..
make
cmake -S . -B build
cmake --build build
```
### Using other compilers
@ -60,7 +59,7 @@ The C library is found in this directory after successful compilation
and is named `libutf8proc.a` (for the static library) and
`libutf8proc.so` (for the dynamic library).
The Unicode version supported is 13.0.0.
The Unicode version supported is 15.1.0.
For Unicode normalizations, the following options are used:

View File

@ -22,7 +22,7 @@ CharWidths.txt: charwidths.jl EastAsianWidth.txt
$(JULIA) charwidths.jl > $@
# Unicode data version (must also update utf8proc_unicode_version function)
UNICODE_VERSION=13.0.0
UNICODE_VERSION=15.1.0
UnicodeData.txt:
$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt

57
3rdparty/utf8proc/data/data_generator.rb vendored Normal file → Executable file
View File

@ -67,7 +67,7 @@
# authorization of the copyright holder.
$ignorable_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m]
$ignorable_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m]
$ignorable = []
$ignorable_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
@ -77,7 +77,7 @@ $ignorable_list.each_line do |entry|
end
end
$uppercase_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Uppercase.*?# Total code points:/m]
$uppercase_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Uppercase.*?# Total code points:/m]
$uppercase = []
$uppercase_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
@ -87,7 +87,7 @@ $uppercase_list.each_line do |entry|
end
end
$lowercase_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Lowercase.*?# Total code points:/m]
$lowercase_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Lowercase.*?# Total code points:/m]
$lowercase = []
$lowercase_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
@ -97,7 +97,33 @@ $lowercase_list.each_line do |entry|
end
end
$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt")
# Indic_Conjunct_Break value for each code point. Code points that appear in
# none of the three sections read below fall back to ..._NONE. The sections of
# DerivedCoreProperties.txt are applied in the order Linker, Consonant, Extend,
# so an assignment from a later section replaces one from an earlier section.
$icb = Hash.new("UTF8PROC_INDIC_CONJUNCT_BREAK_NONE")

# Each section is the text from its "# Indic_Conjunct_Break=..." header up to
# the following "# Total code points:" line. An entry line starts either with
# a range "XXXX..YYYY" or with a single code point "XXXX" (hexadecimal).
$icb_linker_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Linker.*?# Total code points:/m]
$icb_linker_list.each_line do |line|
  case line
  when /^([0-9A-F]+)\.\.([0-9A-F]+)/
    ($1.hex..$2.hex).each { |cp| $icb[cp] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER" }
  when /^[0-9A-F]+/
    $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER"
  end
end

$icb_consonant_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Consonant.*?# Total code points:/m]
$icb_consonant_list.each_line do |line|
  case line
  when /^([0-9A-F]+)\.\.([0-9A-F]+)/
    ($1.hex..$2.hex).each { |cp| $icb[cp] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT" }
  when /^[0-9A-F]+/
    $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT"
  end
end

$icb_extend_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Extend.*?# Total code points:/m]
$icb_extend_list.each_line do |line|
  case line
  when /^([0-9A-F]+)\.\.([0-9A-F]+)/
    ($1.hex..$2.hex).each { |cp| $icb[cp] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND" }
  when /^[0-9A-F]+/
    $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND"
  end
end

$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt", :encoding => 'utf-8')
$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
$grapheme_boundclass_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
@ -107,7 +133,7 @@ $grapheme_boundclass_list.each_line do |entry|
end
end
$emoji_data_list = File.read("emoji-data.txt")
$emoji_data_list = File.read("emoji-data.txt", :encoding => 'utf-8')
$emoji_data_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
$1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC" }
@ -120,7 +146,7 @@ $emoji_data_list.each_line do |entry|
end
end
$charwidth_list = File.read("CharWidths.txt")
$charwidth_list = File.read("CharWidths.txt", :encoding => 'utf-8')
$charwidth = Hash.new(0)
$charwidth_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([0-9]+)/
@ -130,13 +156,13 @@ $charwidth_list.each_line do |entry|
end
end
$exclusions = File.read("CompositionExclusions.txt")[/# \(1\) Script Specifics.*?# Total code points:/m]
$exclusions = File.read("CompositionExclusions.txt", :encoding => 'utf-8')[/# \(1\) Script Specifics.*?# Total code points:/m]
$exclusions = $exclusions.chomp.split("\n").collect { |e| e.hex }
$excl_version = File.read("CompositionExclusions.txt")[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m]
$excl_version = File.read("CompositionExclusions.txt", :encoding => 'utf-8')[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m]
$excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }
$case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read
$case_folding_string = File.read("CaseFolding.txt", :encoding => 'utf-8')
$case_folding = {}
$case_folding_string.chomp.split("\n").each do |line|
next unless line =~ /([0-9A-F]+); [CF]; ([0-9A-F ]+);/i
@ -174,13 +200,13 @@ def cpary2c(array)
return "UINT16_MAX" if array.nil? || array.length == 0
lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
array = cpary2utf16encoded(array)
if lencode >= 7 #we have only 3 bits for the length (which is already cutting it close. might need to change it to 2 bits in future Unicode versions)
if lencode >= 3 #we have only 2 bits for the length
array = [lencode] + array
lencode = 7
lencode = 3
end
idx = pushary(array)
raise "Array index out of bound" if idx > 0x1FFF
return "#{idx | (lencode << 13)}"
raise "Array index out of bound" if idx > 0x3FFF
return "#{idx | (lencode << 14)}"
end
def singlecpmap(cp)
return "UINT16_MAX" if cp == nil
@ -249,7 +275,8 @@ class UnicodeChar
"#{$ignorable.include?(code)}, " <<
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
"#{$charwidth[code]}, 0, " <<
"#{$grapheme_boundclass[code]}},\n"
"#{$grapheme_boundclass[code]}, " <<
"#{$icb[code]}},\n"
end
end
@ -415,7 +442,7 @@ end
$stdout << "};\n\n"
$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},\n"
properties.each { |line|
$stdout << line
}

View File

@ -0,0 +1,10 @@
# pkg-config metadata template for libutf8proc. The @VAR@ placeholders are
# filled in when the file is generated (presumably via CMake configure_file
# with @ONLY, since ${libdir}/${includedir} must survive -- confirm).
prefix=@CMAKE_INSTALL_PREFIX@
# exec_prefix conventionally names the installation root for
# architecture-dependent files, not the bin directory itself.
exec_prefix=${prefix}
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@

Name: libutf8proc
Description: UTF8 processing
Version: @PROJECT_VERSION@
Libs: -L${libdir} -lutf8proc
# NOTE(review): UTF8PROC_EXPORTS is handed to every consumer here; confirm it
# is meant for clients and not only for building the library itself.
Cflags: -I${includedir} -DUTF8PROC_EXPORTS

View File

@ -26,27 +26,27 @@ int main(int argc, char **argv)
++error;
}
if (sizeof(wint_t) > 2 || c < (1<<16)) {
wint_t l0 = towlower(c), u0 = towupper(c);
if (sizeof(wint_t) > 2 || (c < (1<<16) && u < (1<<16) && l < (1<<16))) {
wint_t l0 = towlower((wint_t)c), u0 = towupper((wint_t)c);
/* OS unicode tables may be out of date. But if they
do have a lower/uppercase mapping, hopefully it
is correct? */
if (l0 != c && l0 != l) {
if (l0 != (wint_t)c && l0 != (wint_t)l) {
fprintf(stderr, "MISMATCH %x != towlower(%x) == %x\n",
l, c, l0);
++error;
}
else if (l0 != l) { /* often true for out-of-date OS unicode */
else if (l0 != (wint_t)l) { /* often true for out-of-date OS unicode */
++better;
/* printf("%x != towlower(%x) == %x\n", l, c, l0); */
}
if (u0 != c && u0 != u) {
if (u0 != (wint_t)c && u0 != (wint_t)u) {
fprintf(stderr, "MISMATCH %x != towupper(%x) == %x\n",
u, c, u0);
++error;
}
else if (u0 != u) { /* often true for out-of-date OS unicode */
else if (u0 != (wint_t)u) { /* often true for out-of-date OS unicode */
++better;
/* printf("%x != towupper(%x) == %x\n", u, c, u0); */
}

54
3rdparty/utf8proc/test/fuzz_main.c vendored Normal file
View File

@ -0,0 +1,54 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
/* Fuzz target entry point, works without libFuzzer */
int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size);

/*
 * Standalone driver for the fuzz target: reads the file named by argv[1]
 * fully into memory and passes its bytes to LLVMFuzzerTestOneInput() once.
 *
 * Diagnostics go to stderr. The process always exits with status 0, also
 * when the input is missing, empty or unreadable.
 */
int main(int argc, char **argv)
{
    FILE *f = NULL;   /* NULL until opened so the cleanup path can test it */
    char *buf = NULL;
    long siz_buf;

    if(argc < 2)
    {
        fprintf(stderr, "no input file\n");
        goto err;
    }

    f = fopen(argv[1], "rb");
    if(f == NULL)
    {
        fprintf(stderr, "error opening input file %s\n", argv[1]);
        goto err;
    }

    /* Determine the file size by seeking to the end. */
    if(fseek(f, 0, SEEK_END) != 0)
    {
        fprintf(stderr, "fseek() failed\n");
        goto err;
    }
    siz_buf = ftell(f);
    rewind(f);

    /* Covers both an empty file and ftell() reporting an error (-1). */
    if(siz_buf < 1) goto err;

    buf = (char*)malloc((size_t)siz_buf);
    if(buf == NULL)
    {
        fprintf(stderr, "malloc() failed\n");
        goto err;
    }

    if(fread(buf, (size_t)siz_buf, 1, f) != 1)
    {
        fprintf(stderr, "fread() failed\n");
        goto err;
    }

    (void)LLVMFuzzerTestOneInput((uint8_t*)buf, (size_t)siz_buf);

err:
    /* Shared cleanup for the success and failure paths. */
    if(f != NULL) fclose(f);
    free(buf);
    return 0;
}

84
3rdparty/utf8proc/test/fuzzer.c vendored Normal file
View File

@ -0,0 +1,84 @@
#include <utf8proc.h>
#include <string.h>
int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
{
if(size < 1) return 0;
/* Avoid timeout with long inputs */
if(size > (64 * 1024)) return 0;
if(data[size-1] != '\0') return 0;
const uint8_t* ptr = data;
utf8proc_int32_t c = 0, c_prev = 0, state = 0;
utf8proc_option_t options;
utf8proc_ssize_t ret, bytes = 0;
size_t len = strlen((const char*)data);
while(bytes != len)
{
ret = utf8proc_iterate(ptr, -1, &c);
if(ret < 0 || ret == 0) break;
bytes += ret;
ptr += ret;
utf8proc_tolower(c);
utf8proc_toupper(c);
utf8proc_totitle(c);
utf8proc_islower(c);
utf8proc_isupper(c);
utf8proc_charwidth(c);
utf8proc_category(c);
utf8proc_category_string(c);
utf8proc_codepoint_valid(c);
utf8proc_grapheme_break(c_prev, c);
utf8proc_grapheme_break_stateful(c_prev, c, &state);
c_prev = c;
}
utf8proc_int32_t *copy = size >= 4 ? NULL : malloc(size);
if(copy)
{
size /= 4;
options = UTF8PROC_STRIPCC | UTF8PROC_NLF2LS | UTF8PROC_NLF2PS;
memcpy(copy, data, size);
utf8proc_normalize_utf32(copy, size, options);
options = UTF8PROC_STRIPCC | UTF8PROC_NLF2LS;
memcpy(copy, data, size);
utf8proc_normalize_utf32(copy, size, options);
options = UTF8PROC_STRIPCC | UTF8PROC_NLF2PS;
memcpy(copy, data, size);
utf8proc_normalize_utf32(copy, size, options);
options = UTF8PROC_STRIPCC;
memcpy(copy, data, size);
utf8proc_normalize_utf32(copy, size, options);
options = UTF8PROC_LUMP;
memcpy(copy, data, size);
utf8proc_normalize_utf32(copy, size, options);
options = 0;
memcpy(copy, data, size);
utf8proc_normalize_utf32(copy, size, options);
free(copy);
}
free(utf8proc_NFD(data));
free(utf8proc_NFC(data));
free(utf8proc_NFKD(data));
free(utf8proc_NFKC(data));
free(utf8proc_NFKC_Casefold(data));
return 0;
}

View File

@ -43,7 +43,7 @@ void checkline(const char *_buf, bool verbose) {
else
i++;
}
glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
glen = utf8proc_map(utf8, (utf8proc_ssize_t)j, &g, UTF8PROC_CHARBOUND);
if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
/* the test file contains surrogate codepoints, which are only for UTF-16 */
printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
@ -66,7 +66,7 @@ void checkline(const char *_buf, bool verbose) {
utf8proc_bool expectbreak = false;
do {
utf8proc_int32_t codepoint;
i += utf8proc_iterate(src + i, si - i, &codepoint);
i += (size_t)utf8proc_iterate(src + i, (utf8proc_ssize_t)(si - i), &codepoint);
check(codepoint >= 0, "invalid UTF-8 data");
if (codepoint == 0x002F)
expectbreak = true;
@ -110,6 +110,7 @@ int main(int argc, char **argv)
utf8proc_uint8_t *g;
glen = utf8proc_map(input, 6, &g, UTF8PROC_CHARBOUND);
check(!strcmp((char*)g, (char*)output), "mishandled u+ffff and u+fffe grapheme breaks");
check(glen != 6, "mishandled u+ffff and u+fffe grapheme breaks");
free(g);
};
@ -118,6 +119,13 @@ int main(int argc, char **argv)
checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */
checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */
/* more GB9c tests */
checkline("/ 0915 0300 094d 0300 0924 / 0915 /", true);
checkline("/ 0915 0300 094d 0300 094d 0924 / 0915 /", true);
checkline("/ 0915 0300 0300 / 0924 / 0915 /", true);
checkline("/ 0915 0300 094d 0300 / 0078 /", true);
checkline("/ 0300 094d 0300 / 0924 / 0915 /", true);
check(utf8proc_grapheme_break(0x03b1, 0x03b2), "failed 03b1 / 03b2 test");
check(!utf8proc_grapheme_break(0x03b1, 0x0302), "failed 03b1 0302 test");

View File

@ -8,7 +8,7 @@ static int error;
#define CHECKVALID(pos, val, len) buf[pos] = val; testbytes(buf,len,len,__LINE__)
#define CHECKINVALID(pos, val, len) buf[pos] = val; testbytes(buf,len,UTF8PROC_ERROR_INVALIDUTF8,__LINE__)
static void testbytes(unsigned char *buf, int len, utf8proc_ssize_t retval, int line)
static void testbytes(utf8proc_uint8_t *buf, utf8proc_ssize_t len, utf8proc_ssize_t retval, int line)
{
utf8proc_int32_t out[16];
utf8proc_ssize_t ret;
@ -16,13 +16,13 @@ static void testbytes(unsigned char *buf, int len, utf8proc_ssize_t retval, int
/* Make a copy to ensure that memory is left uninitialized after "len"
* bytes. This way, Valgrind can detect overreads.
*/
unsigned char tmp[16];
memcpy(tmp, buf, len);
utf8proc_uint8_t tmp[16];
memcpy(tmp, buf, (unsigned long int)len);
tests++;
if ((ret = utf8proc_iterate(tmp, len, out)) != retval) {
fprintf(stderr, "Failed (%d):", line);
for (int i = 0; i < len ; i++) {
for (utf8proc_ssize_t i = 0; i < len ; i++) {
fprintf(stderr, " 0x%02x", tmp[i]);
}
fprintf(stderr, " -> %zd\n", ret);
@ -32,8 +32,8 @@ static void testbytes(unsigned char *buf, int len, utf8proc_ssize_t retval, int
int main(int argc, char **argv)
{
uint32_t byt;
unsigned char buf[16];
utf8proc_int32_t byt;
utf8proc_uint8_t buf[16];
(void) argc; (void) argv; /* unused */

13
3rdparty/utf8proc/test/ossfuzz.sh vendored Executable file
View File

@ -0,0 +1,13 @@
#!/bin/bash -eu
# This script is meant to be run by
# https://github.com/google/oss-fuzz/blob/master/projects/utf8proc/Dockerfile
# Configure and build the test targets (including the fuzzer) out of tree.
mkdir -p build
cd build
cmake .. -DUTF8PROC_ENABLE_TESTING=ON -DLIB_FUZZING_ENGINE="$LIB_FUZZING_ENGINE"
make -j"$(nproc)"

# Publish the fuzzer binary under the name expected in $OUT.
cp "$SRC/utf8proc/build/fuzzer" "$OUT/utf8proc_fuzzer"

# Bundle the test text files as the seed corpus. NUL-delimited names keep
# paths containing whitespace intact.
find "$SRC/utf8proc/test" -name "*.txt" -print0 | \
    xargs -0 zip "$OUT/utf8proc_fuzzer_seed_corpus.zip"

View File

@ -8,12 +8,14 @@ int main(int argc, char **argv)
for (i = 1; i < argc; ++i) {
utf8proc_uint8_t cstr[16], *map;
unsigned int c;
utf8proc_uint32_t x;
utf8proc_int32_t c;
if (!strcmp(argv[i], "-V")) {
printf("utf8proc version %s\n", utf8proc_version());
continue;
}
check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
check(sscanf(argv[i],"%x", &x) == 1, "invalid hex input %s", argv[i]);
c = (utf8proc_int32_t)x;
const utf8proc_property_t *p = utf8proc_get_property(c);
if (utf8proc_codepoint_valid(c))
@ -37,6 +39,7 @@ int main(int argc, char **argv)
" ignorable = %d\n"
" control_boundary = %d\n"
" boundclass = %d\n"
" indic_conjunct_break = %d\n"
" charwidth = %d\n",
argv[i], (char*) cstr,
utf8proc_category_string(c),
@ -53,6 +56,7 @@ int main(int argc, char **argv)
p->ignorable,
p->control_boundary,
p->boundclass,
p->indic_conjunct_break,
utf8proc_charwidth(c));
free(map);
}

View File

@ -29,7 +29,8 @@ size_t skipspaces(const unsigned char *buf, size_t i)
in dest, returning the number of bytes read from buf */
size_t encode(unsigned char *dest, const unsigned char *buf)
{
size_t i = 0, j, d = 0;
size_t i = 0, j;
utf8proc_ssize_t d = 0;
for (;;) {
int c;
i = skipspaces(buf, i);

View File

@ -1,6 +1,6 @@
/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
/*
* Copyright (c) 2014-2019 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
* Copyright (c) 2014-2021 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
}
UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
return "13.0.0";
return "15.1.0";
}
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
@ -125,7 +125,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
) {
utf8proc_uint32_t uc;
utf8proc_int32_t uc;
const utf8proc_uint8_t *end;
*dst = -1;
@ -137,7 +137,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
return 1;
}
// Must be between 0xc2 and 0xf4 inclusive to be valid
if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
if (uc < 0xe0) { // 2-byte sequence
// Must have valid continuation character
if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
@ -288,35 +288,54 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
true; // GB999
}
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state)
{
if (state) {
int lbc_override;
if (*state == UTF8PROC_BOUNDCLASS_START)
*state = lbc_override = lbc;
else
lbc_override = *state;
utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
int state_bc, state_icb; /* boundclass and indic_conjunct_break state */
if (*state == 0) { /* state initialization */
state_bc = lbc;
state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
}
else { /* lbc and licb are already encoded in *state */
state_bc = *state & 0xff; // 1st byte of state is bound class
state_icb = *state >> 8; // 2nd byte of state is indic conjunct break
}
utf8proc_bool break_permitted = grapheme_break_simple(state_bc, tbc) &&
!(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
&& ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c
// Special support for GB9c. Don't break between two consonants
// separated by 1+ linker characters and 0+ extend characters in any order.
// After a consonant, we enter LINKER state after at least one linker.
if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
|| state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
|| state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND)
state_icb = ticb;
else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER)
state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ?
UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb;
// Special support for GB 12/13 made possible by GB999. After two RI
// class codepoints we want to force a break. Do this by resetting the
// second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
// after that character according to GB999 (unless of course such a break is
// forbidden by a different rule such as GB9).
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
*state = UTF8PROC_BOUNDCLASS_OTHER;
if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
state_bc = UTF8PROC_BOUNDCLASS_OTHER;
// Special support for GB11 (emoji extend* zwj / emoji)
else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
*state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
*state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
else
*state = tbc;
state_bc = tbc;
}
else
*state = tbc;
state_bc = tbc;
*state = state_bc + (state_icb << 8);
return break_permitted;
}
else
@ -326,8 +345,12 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
utf8proc_get_property(c2)->boundclass,
const utf8proc_property_t *p1 = utf8proc_get_property(c1);
const utf8proc_property_t *p2 = utf8proc_get_property(c2);
return grapheme_break_extended(p1->boundclass,
p2->boundclass,
p1->indic_conjunct_break,
p2->indic_conjunct_break,
state);
}
@ -356,9 +379,9 @@ static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
utf8proc_ssize_t written = 0;
const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF];
int len = seqindex >> 13;
if (len >= 7) {
const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x3FFF];
int len = seqindex >> 14;
if (len >= 3) {
len = *entry;
entry++;
}
@ -376,19 +399,19 @@ static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqinde
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
{
utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
return cl != UINT16_MAX ? seqindex_decode_index(cl) : c;
return cl != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cl) : c;
}
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
{
utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c;
}
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
{
utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex;
return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c;
}
UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c)
@ -410,7 +433,7 @@ UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
}
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
return utf8proc_get_property(c)->category;
return (utf8proc_category_t) utf8proc_get_property(c)->category;
}
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
@ -420,7 +443,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
#define utf8proc_decompose_lump(replacement_uc) \
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
options & ~UTF8PROC_LUMP, last_boundclass)
options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass)
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
const utf8proc_property_t *property;
@ -498,8 +521,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
}
if (options & UTF8PROC_CHARBOUND) {
utf8proc_bool boundary;
int tbc = property->boundclass;
boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break,
last_boundclass);
if (boundary) {
if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
if (bufsize >= 2) dst[1] = uc;
@ -735,7 +758,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
*dstptr = NULL;
result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
if (result < 0) return result;
buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
buffer = (utf8proc_int32_t *) malloc(((utf8proc_size_t)result) * sizeof(utf8proc_int32_t) + 1);
if (!buffer) return UTF8PROC_ERROR_NOMEM;
result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
if (result < 0) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2014-2019 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
* Copyright (c) 2014-2021 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -71,9 +71,9 @@
/** The MAJOR version number (increased when backwards API compatibility is broken). */
#define UTF8PROC_VERSION_MAJOR 2
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
#define UTF8PROC_VERSION_MINOR 6
#define UTF8PROC_VERSION_MINOR 9
/** The PATCH version (increased for fixes that do not change the API). */
#define UTF8PROC_VERSION_PATCH 1
#define UTF8PROC_VERSION_PATCH 0
/** @} */
#include <stdlib.h>
@ -273,7 +273,8 @@ typedef struct utf8proc_property_struct {
* Boundclass.
* @see utf8proc_boundclass_t.
*/
unsigned boundclass:8;
unsigned boundclass:6;
unsigned indic_conjunct_break:2;
} utf8proc_property_t;
/** Unicode categories. */
@ -388,6 +389,14 @@ typedef enum {
UTF8PROC_BOUNDCLASS_E_ZWG = 20, /* UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ */
} utf8proc_boundclass_t;
/** Indic_Conjunct_Break property. (TR44) */
typedef enum {
UTF8PROC_INDIC_CONJUNCT_BREAK_NONE = 0,
UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER = 1,
UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT = 2,
UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND = 3,
} utf8proc_indic_conjunct_break_t;
/**
* Function pointer type passed to @ref utf8proc_map_custom and
* @ref utf8proc_decompose_custom, which is used to specify a user-defined
@ -481,8 +490,9 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
* - @ref UTF8PROC_STRIPNA - remove unassigned codepoints
* @param last_boundclass
* Pointer to an integer variable containing
* the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
* option is used. Otherwise, this parameter is ignored.
* the previous codepoint's (boundclass + (indic_conjunct_break << 8)) if the @ref UTF8PROC_CHARBOUND
* option is used. If the string is being processed in order, this can be initialized to 0 for
* the beginning of the string, and is thereafter updated automatically. Otherwise, this parameter is ignored.
*
* @return
* In case of success, the number of codepoints written is returned; in case

File diff suppressed because it is too large Load Diff

View File

@ -197,7 +197,7 @@ tap-windows.h
Copyright (C) 2002-2014 OpenVPN Technologies, Inc.
utf8proc
Copyright (c) 2014-2015 Steven G. Johnson, Jiahao Chen, Tony Kelman,
Copyright (c) 2014-2021 Steven G. Johnson, Jiahao Chen, Tony Kelman,
Jonas Fonseca, and other contributors
Permission is hereby granted, free of charge, to any person obtaining a

View File

@ -1747,11 +1747,6 @@ project "utf8proc"
"verbose=-1",
}
configuration { "gmake or ninja" }
buildoptions_c {
"-Wno-strict-prototypes",
}
configuration { }
files {