I've managed to reproduce the issue locally. In order to reproduce it, you need to:
1) set up a VM with 4 cores and 32G of RAM (the exact size does not matter, just ensure there is enough RAM).
2) restore the DB taken from the reporter's environment
3) install MAAS 3.2.9 (or a newer version)
4) hammer MAAS with a script like the following for at least 1 minute (in order to overload MAAS) and keep it running
```
import asyncio
import aiohttp
import time
import uuid
import random
import numpy as np
import json
base_url = 'http://10.0.2.46:5240'  # Replace with your MAAS URL


async def make_http_request(session, http_method, url):
    """Send one authenticated request to MAAS and drain the response slowly.

    The response body is read in small chunks with a pause between them so
    that the connection stays open for a long time and keeps MAAS busy.

    Args:
        session: object exposing an aiohttp-style ``request()`` async context
            manager (``main`` passes an ``aiohttp.ClientSession``).
        http_method: HTTP verb; ``"POST"`` requests carry a fixed JSON body.
        url: path (and query string) appended to ``base_url``.

    Returns:
        ``(full_url, status)`` on success, or ``(full_url, error_message)``
        when any exception is raised while sending or reading.
    """
    headers = {
        "Authorization": (
            "OAuth oauth_version=1.0, oauth_signature_method=PLAINTEXT, oauth_consumer_key=<REDACTED>, oauth_token=<REDACTED>, oauth_signature=&<REDACTED>, oauth_nonce="
            + str(uuid.uuid4())
            + ", oauth_timestamp="
            + str(int(time.time()))
        )
    }
    full_url = f"{base_url}{url}"
    print(full_url)
    try:
        body = {}
        if http_method == "POST":
            body = {'mode': 'AUTO', 'subnet': 2}
        async with session.request(http_method, full_url, data=json.dumps(body), headers=headers) as response:
            while True:
                chunk = await response.content.read(512)  # Adjust the chunk size as needed
                if not chunk:
                    break
                await asyncio.sleep(0.5)  # just slow down the download speed
            print("OK")
            return full_url, response.status
    except Exception as e:
        # Deliberately broad: this is a load generator, so a failed request
        # is reported and returned instead of stopping the whole run.
        print("EXCEPTION")
        print(str(e))
        return full_url, str(e)
# A sample of the HTTP calls seen in the reporter's environment, as
# (method, path) pairs. The order matters: entries are picked by index.
urls = [
    # This is the most problematic one, as usual.
    ("GET", "/MAAS/api/2.0/machines/?agent_name=9a7502fa-3149-4d63-8b92-aec26ae8d7eb&id=hydcaa"),
    ("GET", "/MAAS/api/2.0/version/"),
    ("GET", "/MAAS/api/2.0/users/?op=whoami"),
    ("GET", "/MAAS/api/2.0/zones/"),
    ("GET", "/MAAS/api/2.0/spaces/"),
    ("GET", "/MAAS/api/2.0/static-routes/"),
    ("GET", "/MAAS/rpc/"),
    ("GET", "/MAAS/api/2.0/devices/?hostname=juju-ad2531-5-lxd-8"),
    ("POST", "/MAAS/api/2.0/nodes/mdyndn/interfaces/519149/?op=link_subnet"),
]
async def main():
    """Keep firing randomly chosen requests at MAAS until interrupted.

    Every iteration picks one entry of ``urls`` at random (weighted), waits
    a short while and schedules the request in the background without
    waiting for it to finish, so many requests are in flight at once.
    This coroutine never returns on its own.
    """
    # Probability of each entry of `urls` (same order), calculated on the
    # reporter's logs.
    weights = [
        0.207619047619047616,
        0.2161904761904762,
        0.14761904761904763,
        0.02857142857142857,
        0.0761904761904762,
        0.03333333333333333,
        0.047619047619047616,
        0.09047619047619047,
        0.1523809523809524,
    ]
    # Strong references to the background tasks: the event loop only keeps
    # weak references, so an unreferenced task may be garbage collected
    # before it completes.
    in_flight = set()
    async with aiohttp.ClientSession() as session:
        while True:
            # pick a request to shoot according to the weights above
            bullet = np.random.choice(range(len(urls)), p=weights)
            method, url = urls[bullet]
            # sleep a bit between every request.
            # the less we sleep, the more you'll hit the issue.
            await asyncio.sleep(0.07)
            task = asyncio.ensure_future(make_http_request(session, method, url))
            in_flight.add(task)
            task.add_done_callback(in_flight.discard)
if __name__ == '__main__':
    # asyncio.run() creates the event loop, runs main() and closes the loop
    # on exit; it replaces the deprecated get_event_loop() /
    # run_until_complete() pair. main() loops forever, so stop with Ctrl-C.
    asyncio.run(main())
```
5) run the following bash script
```
#!/bin/bash
# Fire 6 concurrent "allocate" requests at MAAS and wait for all of them.
# Each request gets its own OAuth nonce and timestamp.
for i in {1..6}; do
    # The URL is quoted so that the shell never treats "?" as a glob pattern.
    curl --header "Authorization: OAuth oauth_version=1.0, oauth_signature_method=PLAINTEXT, oauth_consumer_key=<REDACTED>, oauth_token=<REDACTED>, oauth_signature=&<REDACTED>, oauth_nonce=$(uuidgen), oauth_timestamp=$(date +%s)" "http://10.0.2.46:5240/MAAS/api/2.0/machines/?op=allocate" -X POST &
done
# Block until every background curl has finished.
wait
```
6) after around 90 seconds, you should get a 409 for a request.
The underlying issue should be that the allocate operation uses a database lock and the request that is accessing the database is so slow that all the others are blocked until the first transaction is committed/aborted.
I think this issue is actually caused by the slowness of the MAAS API. A redesign of MAAS is on the roadmap in order to address this known issue. For the time being, my suggestion is to add another region machine, add HAProxy and/or use a more powerful machine (I noticed that the 5 MAAS processes are using from 70% to 100% CPU during high load, and this is causing the unpredictable issues you've hit).
I've managed to reproduce the issue locally. In order to reproduce it, you need to:
1) set up a VM with 4 cores and 32G of RAM (the exact size does not matter, just ensure there is enough RAM).
2) restore the DB taken from the reporter's environment
3) install MAAS 3.2.9 (or a newer version)
4) hammer MAAS with a script like the following for at least 1 minute (in order to overload MAAS) and keep it running
```
import asyncio
import aiohttp
import time
import uuid
import random
import numpy as np
import json
base_url = 'http://10.0.2.46:5240'  # Replace with your MAAS URL


async def make_http_request(session, http_method, url):
    """Send one authenticated request to MAAS and drain the response slowly.

    The response body is read in small chunks with a pause between them so
    that the connection stays open for a long time and keeps MAAS busy.

    Args:
        session: object exposing an aiohttp-style ``request()`` async context
            manager (``main`` passes an ``aiohttp.ClientSession``).
        http_method: HTTP verb; ``"POST"`` requests carry a fixed JSON body.
        url: path (and query string) appended to ``base_url``.

    Returns:
        ``(full_url, status)`` on success, or ``(full_url, error_message)``
        when any exception is raised while sending or reading.
    """
    headers = {
        "Authorization": (
            "OAuth oauth_version=1.0, oauth_signature_method=PLAINTEXT, oauth_consumer_key=<REDACTED>, oauth_token=<REDACTED>, oauth_signature=&<REDACTED>, oauth_nonce="
            + str(uuid.uuid4())
            + ", oauth_timestamp="
            + str(int(time.time()))
        )
    }
    full_url = f"{base_url}{url}"
    print(full_url)
    try:
        body = {}
        if http_method == "POST":
            body = {'mode': 'AUTO', 'subnet': 2}
        async with session.request(http_method, full_url, data=json.dumps(body), headers=headers) as response:
            while True:
                chunk = await response.content.read(512)  # Adjust the chunk size as needed
                if not chunk:
                    break
                await asyncio.sleep(0.5)  # just slow down the download speed
            print("OK")
            return full_url, response.status
    except Exception as e:
        # Deliberately broad: this is a load generator, so a failed request
        # is reported and returned instead of stopping the whole run.
        print("EXCEPTION")
        print(str(e))
        return full_url, str(e)


# Some of the HTTP calls from the reporter's environment, as (method, path)
# pairs. The order matters: main() picks entries by index.
urls = [
    ("GET", "/MAAS/api/2.0/machines/?agent_name=9a7502fa-3149-4d63-8b92-aec26ae8d7eb&id=hydcaa"),  # This is the most problematic one, as usual.
    ("GET", "/MAAS/api/2.0/version/"),
    ("GET", "/MAAS/api/2.0/users/?op=whoami"),
    ("GET", "/MAAS/api/2.0/zones/"),
    ("GET", "/MAAS/api/2.0/spaces/"),
    ("GET", "/MAAS/api/2.0/static-routes/"),
    ("GET", "/MAAS/rpc/"),
    ("GET", "/MAAS/api/2.0/devices/?hostname=juju-ad2531-5-lxd-8"),
    ("POST", "/MAAS/api/2.0/nodes/mdyndn/interfaces/519149/?op=link_subnet"),
]


async def main():
    """Keep firing randomly chosen requests at MAAS until interrupted.

    Every iteration picks one entry of ``urls`` at random (weighted), waits
    a short while and schedules the request in the background without
    waiting for it to finish, so many requests are in flight at once.
    This coroutine never returns on its own.
    """
    # Probability of each entry of `urls` (same order), calculated on the
    # reporter's logs.
    weights = [
        0.207619047619047616,
        0.2161904761904762,
        0.14761904761904763,
        0.02857142857142857,
        0.0761904761904762,
        0.03333333333333333,
        0.047619047619047616,
        0.09047619047619047,
        0.1523809523809524,
    ]
    # Strong references to the background tasks: the event loop only keeps
    # weak references, so an unreferenced task may be garbage collected
    # before it completes.
    in_flight = set()
    async with aiohttp.ClientSession() as session:
        while True:
            # pick a request to shoot according to the weights above
            bullet = np.random.choice(range(len(urls)), p=weights)
            method, url = urls[bullet]
            # sleep a bit between every request.
            # the less we sleep, the more you'll hit the issue.
            await asyncio.sleep(0.07)
            task = asyncio.ensure_future(make_http_request(session, method, url))
            in_flight.add(task)
            task.add_done_callback(in_flight.discard)


if __name__ == '__main__':
    # asyncio.run() creates the event loop, runs main() and closes the loop
    # on exit. main() loops forever, so stop the script with Ctrl-C.
    asyncio.run(main())
```
5) run the following bash script
```
#!/bin/bash
# Fire 6 concurrent "allocate" requests at MAAS and wait for all of them.
# Each request gets its own OAuth nonce and timestamp.
for i in {1..6}; do
    # The URL is quoted so that the shell never treats "?" as a glob pattern.
    curl --header "Authorization: OAuth oauth_version=1.0, oauth_signature_method=PLAINTEXT, oauth_consumer_key=<REDACTED>, oauth_token=<REDACTED>, oauth_signature=&<REDACTED>, oauth_nonce=$(uuidgen), oauth_timestamp=$(date +%s)" "http://10.0.2.46:5240/MAAS/api/2.0/machines/?op=allocate" -X POST &
done
# Block until every background curl has finished.
wait
```
6) after around 90 seconds, you should get a 409 for a request.
The underlying issue should be that the allocate operation uses a database lock and the request that is accessing the database is so slow that all the others are blocked until the first transaction is committed/aborted.
I think this issue is actually caused by the slowness of the MAAS API. A redesign of MAAS is on the roadmap in order to address this known issue. For the time being, my suggestion is to add another region machine, add HAProxy and/or use a more powerful machine (I noticed that the 5 MAAS processes are using from 70% to 100% CPU during high load, and this is causing the unpredictable issues you've hit).